From d7a7de79f8a6498bae52331f4789adcac76b8e8c Mon Sep 17 00:00:00 2001 From: smaslov-intel Date: Thu, 15 Dec 2022 23:44:08 -0800 Subject: [PATCH 01/11] [SYCL] Add Unified Runtime Plugin (#7521) Add new pi_unified_plugin that implements only piPlatformsGet (via Unified Runtime zerPlatformGet) The next step is to add a new backend to SYCL and actually start calling to pi_unified_plugin Signed-off-by: Sergey V Maslov Co-authored-by: Pavel V Chupin Co-authored-by: Steffen Larsen --- sycl/plugins/level_zero/CMakeLists.txt | 177 ++++++--------- sycl/plugins/level_zero/pi_level_zero.cpp | 211 +----------------- sycl/plugins/level_zero/pi_level_zero.hpp | 17 +- sycl/plugins/level_zero/tracing.cpp | 4 +- sycl/plugins/level_zero/ur_bindings.hpp | 17 ++ sycl/plugins/unified_runtime/CMakeLists.txt | 98 ++++---- sycl/plugins/unified_runtime/pi2ur.cpp | 31 --- sycl/plugins/unified_runtime/pi2ur.hpp | 32 ++- .../unified_runtime/pi_unified_runtime.cpp | 17 ++ .../unified_runtime/pi_unified_runtime.hpp | 8 + .../adapters/level_zero/ur_level_zero.cpp | 188 +++++++++++++++- .../adapters/level_zero/ur_level_zero.hpp | 19 +- sycl/plugins/unified_runtime/ur/ur.cpp | 26 +++ sycl/plugins/unified_runtime/{ => ur}/ur.hpp | 49 +++- sycl/plugins/unified_runtime/ur_bindings.hpp | 17 ++ .../basic_tests/interop-backend-traits.cpp | 4 +- .../basic_tests/interop-level-zero-2020.cpp | 2 +- sycl/test/lit.cfg.py | 3 +- sycl/test/lit.site.cfg.py.in | 1 + sycl/tools/sycl-trace/CMakeLists.txt | 4 +- sycl/tools/sycl-trace/ze_trace_collector.cpp | 2 +- 21 files changed, 495 insertions(+), 432 deletions(-) create mode 100644 sycl/plugins/level_zero/ur_bindings.hpp mode change 100755 => 100644 sycl/plugins/unified_runtime/pi2ur.hpp create mode 100644 sycl/plugins/unified_runtime/pi_unified_runtime.cpp create mode 100644 sycl/plugins/unified_runtime/pi_unified_runtime.hpp rename sycl/plugins/unified_runtime/{ => ur}/adapters/level_zero/ur_level_zero.cpp (50%) rename sycl/plugins/unified_runtime/{ => ur}/adapters/level_zero/ur_level_zero.hpp (92%) create mode 100644 sycl/plugins/unified_runtime/ur/ur.cpp rename sycl/plugins/unified_runtime/{ => ur}/ur.hpp (54%) mode change 100755 => 100644 create mode 100644 sycl/plugins/unified_runtime/ur_bindings.hpp diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 7b0ce7eaeeaab..5ee92b16e2e91 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -1,131 +1,101 @@ # PI Level Zero plugin library -if(MSVC) - set(LEVEL_ZERO_LOADER - "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_loader${CMAKE_STATIC_LIBRARY_SUFFIX}") -else() - set(LEVEL_ZERO_LOADER - "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}ze_loader${CMAKE_SHARED_LIBRARY_SUFFIX}") -endif() - if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) message(STATUS "Download Level Zero loader and headers from github.com") - if (CMAKE_C_COMPILER) - list(APPEND AUX_CMAKE_FLAGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}) - endif() - if (CMAKE_CXX_COMPILER) - list(APPEND AUX_CMAKE_FLAGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}) - endif() - file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader_build) - set(LEVEL_ZERO_LOADER_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/level_zero/level_zero_loader") - if (NOT DEFINED SYCL_EP_LEVEL_ZERO_LOADER_SKIP_AUTO_UPDATE) - set(SYCL_EP_LEVEL_ZERO_LOADER_SKIP_AUTO_UPDATE ${SYCL_EXTERNAL_PROJECTS_SKIP_AUTO_UPDATE}) - endif() - set(LEVEL_ZERO_LOADER_SOURCE_VERSION v1.8.8) 
- - #TODO: Replace ExternalProject with FetchContent for better maintainance and - # cmake files simplification - include(ExternalProject) - ExternalProject_Add(level-zero-loader - GIT_REPOSITORY https://github.com/oneapi-src/level-zero.git - GIT_TAG ${LEVEL_ZERO_LOADER_SOURCE_VERSION} - UPDATE_DISCONNECTED ${SYCL_EP_LEVEL_ZERO_LOADER_SKIP_AUTO_UPDATE} - SOURCE_DIR ${LEVEL_ZERO_LOADER_SOURCE_DIR} - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader_build" - INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader_install" - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} - -DCMAKE_INSTALL_PREFIX= - -DCMAKE_INSTALL_LIBDIR:PATH=lib${LLVM_LIBDIR_SUFFIX} - ${AUX_CMAKE_FLAGS} - LOG_DOWNLOAD 1 - LOG_UPDATE 1 - LOG_CONFIGURE 1 - LOG_BUILD 1 - LOG_INSTALL 1 - STEP_TARGETS configure,build,install - DEPENDS OpenCL-Headers - BUILD_BYPRODUCTS ${LEVEL_ZERO_LOADER} - ) - ExternalProject_Add_Step(level-zero-loader llvminstall - COMMAND ${CMAKE_COMMAND} -E copy_directory /lib/ ${LLVM_BINARY_DIR}/lib - COMMAND ${CMAKE_COMMAND} -E copy_directory /include/ ${LLVM_BINARY_DIR}/include/sycl - COMMENT "Installing level-zero-loader into the LLVM binary directory" - DEPENDEES install - ) - install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader_install/" - DESTINATION "." - COMPONENT level-zero-loader - ) + set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") + set(LEVEL_ZERO_LOADER_TAG v1.8.8) - set(LEVEL_ZERO_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader_install/include/ - CACHE INTERNAL "Path containing Level_Zero header files.") -else() - file(GLOB LEVEL_ZERO_LIBRARY_SRC "${LEVEL_ZERO_LIBRARY}*") - get_filename_component(LEVEL_ZERO_LIB_NAME ${LEVEL_ZERO_LIBRARY} NAME) - add_custom_target(level-zero-loader - DEPENDS - ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LEVEL_ZERO_LIB_NAME} - ${LLVM_BINARY_DIR}/include/sycl/level_zero - ) - add_custom_command( - OUTPUT - ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LEVEL_ZERO_LIB_NAME} - ${LLVM_BINARY_DIR}/include/sycl/level_zero - COMMENT - "Copying Level Zero loader and headers" - COMMAND - ${CMAKE_COMMAND} -E copy ${LEVEL_ZERO_LIBRARY_SRC} ${LLVM_LIBRARY_OUTPUT_INTDIR} - COMMAND - ${CMAKE_COMMAND} -E copy_directory ${LEVEL_ZERO_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include/sycl - DEPENDS - ${LEVEL_ZERO_LIBRARY} - ${LEVEL_ZERO_INCLUDE_DIR} + # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 + set(CMAKE_INCLUDE_CURRENT_DIR OFF) + + message(STATUS "Will fetch Level Zero Loader from ${LEVEL_ZERO_LOADER_REPO}") + include(FetchContent) + FetchContent_Declare(level-zero-loader + GIT_REPOSITORY ${LEVEL_ZERO_LOADER_REPO} + GIT_TAG ${LEVEL_ZERO_LOADER_TAG} ) -endif() -list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS level-zero-loader level-zero-headers) + # Suppress some pedantic warnings for Level Zero build + set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-but-set-variable") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pedantic") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-truncation") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") -include_directories("${LEVEL_ZERO_INCLUDE_DIR}") + FetchContent_MakeAvailable(level-zero-loader) + FetchContent_GetProperties(level-zero-loader) + + # Restore original flags + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}") + + set(LEVEL_ZERO_LIBRARY ze_loader) + set(LEVEL_ZERO_INCLUDE_DIR + ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers") 
+endif() + +add_library (LevelZeroLoader INTERFACE) +target_link_libraries(LevelZeroLoader + INTERFACE "${LEVEL_ZERO_LIBRARY}" +) add_library (LevelZeroLoader-Headers INTERFACE) -add_library (LevelZeroLoader::Headers ALIAS LevelZeroLoader-Headers) target_include_directories(LevelZeroLoader-Headers INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}" ) if (SYCL_ENABLE_XPTI_TRACING) set(XPTI_PROXY_SRC "${CMAKE_SOURCE_DIR}/../xpti/src/xpti_proxy.cpp") + set(XPTI_INCLUDE "${CMAKE_SOURCE_DIR}/../xpti/include") + set(XPTI_LIBS "${CMAKE_DL_LIBS}") endif() +find_package(Python3 REQUIRED) + +add_custom_target(ze-api + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/ze_api_generator.py + ${LEVEL_ZERO_INCLUDE_DIR}/ze_api.h + BYPRODUCTS + ${CMAKE_CURRENT_BINARY_DIR}/ze_api.def + ) + find_package(Threads REQUIRED) add_sycl_plugin(level_zero SOURCES - "${sycl_inc_dir}/sycl/detail/pi.h" - "${CMAKE_CURRENT_SOURCE_DIR}/pi_level_zero.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/pi_level_zero.hpp" - "${CMAKE_CURRENT_SOURCE_DIR}/usm_allocator.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/usm_allocator.hpp" - "${CMAKE_CURRENT_SOURCE_DIR}/tracing.cpp" + # These are short-term shared with Unified Runtime + # The two plugins define a few things differrently so must + # be built separately. This difference is spelled in + # their "ur_bindings.hpp" files. + # + "ur_bindings.hpp" + "../unified_runtime/pi2ur.hpp" + "../unified_runtime/pi2ur.cpp" + "../unified_runtime/ur/ur.hpp" + "../unified_runtime/ur/ur.cpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp" + # Following are the PI Level-Zero Plugin only codes. + "pi_level_zero.cpp" + "pi_level_zero.hpp" + "usm_allocator.cpp" + "usm_allocator.hpp" + "tracing.cpp" ${XPTI_PROXY_SRC} + INCLUDE_DIRS + ${CMAKE_CURRENT_BINARY_DIR} # for ze_api.def + ${CMAKE_CURRENT_SOURCE_DIR} # for Level-Zero Plugin "ur_bindings.hpp" + ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime # for Unified Runtime + ${XPTI_INCLUDE} LIBRARIES - "${LEVEL_ZERO_LOADER}" + LevelZeroLoader-Headers + UnifiedRuntime-Headers + LevelZeroLoader Threads::Threads - unified_runtime + ${XPTI_LIBS} ) -find_package(Python3 REQUIRED) - -add_custom_target(ze-api - COMMAND ${Python3_EXECUTABLE} - ${CMAKE_CURRENT_SOURCE_DIR}/ze_api_generator.py - ${LEVEL_ZERO_INCLUDE_DIR}/level_zero/ze_api.h - BYPRODUCTS - ${CMAKE_CURRENT_BINARY_DIR}/ze_api.def - ) -target_include_directories(pi_level_zero PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) add_dependencies(pi_level_zero ze-api) if (SYCL_ENABLE_XPTI_TRACING) @@ -133,11 +103,4 @@ if (SYCL_ENABLE_XPTI_TRACING) XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY ) - target_include_directories(pi_level_zero PRIVATE "${CMAKE_SOURCE_DIR}/../xpti/include") - target_link_libraries(pi_level_zero PRIVATE ${CMAKE_DL_LIBS}) -endif() - -if (TARGET level-zero-loader) - add_dependencies(ze-api level-zero-loader) - add_dependencies(pi_level_zero level-zero-loader) endif() diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 426cce88fbba1..6c93a6e82c0aa 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -24,8 +24,9 @@ #include #include -#include +#include +#include "ur_bindings.hpp" #include "usm_allocator.hpp" extern "C" { @@ -90,19 +91,13 @@ static const bool ReuseDiscardedEvents = [] { return std::stoi(ReuseDiscardedEventsFlag) > 0; }(); -// Controls PI level tracing prints. 
-static bool PrintPiTrace = false; - // Controls support of the indirect access kernels and deferred memory release. static const bool IndirectAccessTrackingEnabled = [] { return std::getenv("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY") != nullptr; }(); -// This will count the calls to Level-Zero -static std::map *ZeCallCount = nullptr; - -// Map from L0 to PI result +// Map from L0 to PI result. static inline pi_result mapError(ze_result_t Result) { return ur2piResult(ze2urResult(Result)); } @@ -118,7 +113,7 @@ static inline pi_result mapError(ze_result_t Result) { // Trace an internal PI call; returns in case of an error. #define PI_CALL(Call) \ { \ - if (PrintPiTrace) \ + if (PrintTrace) \ fprintf(stderr, "PI ---> %s\n", #Call); \ pi_result Result = (Call); \ if (Result != PI_SUCCESS) \ @@ -358,15 +353,6 @@ static bool CopyEngineRequested(pi_device Device) { } // Global variables used in PI_Level_Zero -// Note we only create a simple pointer variables such that C++ RT won't -// deallocate them automatically at the end of the main program. -// The heap memory allocated for these global variables reclaimed only when -// Sycl RT calls piTearDown(). -static std::vector *PiPlatformsCache = - new std::vector; -static sycl::detail::SpinLock *PiPlatformsCacheMutex = - new sycl::detail::SpinLock; -static bool PiPlatformCachePopulated = false; pi_result _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool, @@ -476,55 +462,6 @@ static pi_result enqueueMemCopyRectHelper( const pi_event *EventWaitList, pi_event *Event, bool PreferCopyEngine = false); -inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { - switch (ZeError) { -#define ZE_ERRCASE(ERR) \ - case ERR: \ - ErrorString = "" #ERR; \ - break; - - ZE_ERRCASE(ZE_RESULT_SUCCESS) - ZE_ERRCASE(ZE_RESULT_NOT_READY) - ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST) - ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) - ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) - ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) - ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) - ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE) - ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) - ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) - ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) - ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN) - -#undef ZE_ERRCASE - 
default: - assert(false && "Unexpected Error code"); - } // switch -} - // Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR constexpr size_t MaxMessageSize = 256; thread_local pi_result ErrorMessageCode = PI_SUCCESS; @@ -544,26 +481,6 @@ pi_result piPluginGetLastError(char **message) { return ErrorMessageCode; } -ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, - const char *ZeArgs, bool TraceError) { - zePrint("ZE ---> %s%s\n", ZeName, ZeArgs); - - if (ZeDebug & ZE_DEBUG_CALL_COUNT) { - ++(*ZeCallCount)[ZeName]; - } - - if (ZeResult && TraceError) { - const char *ErrorString = "Unknown"; - zeParseError(ZeResult, ErrorString); - zePrint("Error (%s) in %s\n", ErrorString, ZeName); - } - return ZeResult; -} - -#define PI_ASSERT(condition, error) \ - if (!(condition)) \ - return error; - bool _pi_queue::doReuseDiscardedEvents() { return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); } @@ -1730,7 +1647,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // traces incurs much different timings than real execution // ansyway, and many regression tests use it. // - bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; + bool CurrentlyEmpty = !PrintTrace && this->LastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy // events. It is possible that executeCommandList is called twice for the same @@ -2385,117 +2302,7 @@ checkUnresolvedSymbols(ze_module_handle_t ZeModule, pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { - - static const char *PiTrace = std::getenv("SYCL_PI_TRACE"); - static const int PiTraceValue = PiTrace ? std::stoi(PiTrace) : 0; - if (PiTraceValue == -1 || PiTraceValue == 2) { // Means print all PI traces - PrintPiTrace = true; - } - - static std::once_flag ZeCallCountInitialized; - try { - std::call_once(ZeCallCountInitialized, []() { - if (ZeDebug & ZE_DEBUG_CALL_COUNT) { - ZeCallCount = new std::map; - } - }); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (NumEntries == 0 && Platforms != nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (Platforms == nullptr && NumPlatforms == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - - // Setting these environment variables before running zeInit will enable the - // validation layer in the Level Zero loader. - if (ZeDebug & ZE_DEBUG_VALIDATION) { - setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); - setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); - } - - // Enable SYSMAN support for obtaining the PCI address - // and maximum memory bandwidth. - if (getenv("SYCL_ENABLE_PCI") != nullptr) { - setEnvVar("ZES_ENABLE_SYSMAN", "1"); - } - - // TODO: We can still safely recover if something goes wrong during the init. - // Implement handling segfault using sigaction. - - // We must only initialize the driver once, even if piPlatformsGet() is called - // multiple times. Declaring the return value as "static" ensures it's only - // called once. - static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); - - // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. 
- if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - PI_ASSERT(NumPlatforms != 0, PI_ERROR_INVALID_VALUE); - *NumPlatforms = 0; - return PI_SUCCESS; - } - - if (ZeResult != ZE_RESULT_SUCCESS) { - zePrint("zeInit: Level Zero initialization failure\n"); - return mapError(ZeResult); - } - - // Cache pi_platforms for reuse in the future - // It solves two problems; - // 1. sycl::platform equality issue; we always return the same pi_platform. - // 2. performance; we can save time by immediately return from cache. - // - - const std::lock_guard Lock{*PiPlatformsCacheMutex}; - if (!PiPlatformCachePopulated) { - try { - // Level Zero does not have concept of Platforms, but Level Zero driver is - // the closest match. - uint32_t ZeDriverCount = 0; - ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); - if (ZeDriverCount == 0) { - PiPlatformCachePopulated = true; - } else { - std::vector ZeDrivers; - ZeDrivers.resize(ZeDriverCount); - - ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); - for (uint32_t I = 0; I < ZeDriverCount; ++I) { - pi_platform Platform = new _pi_platform(ZeDrivers[I]); - // Save a copy in the cache for future uses. - PiPlatformsCache->push_back(Platform); - - pi_result Result = Platform->initialize(); - if (Result != PI_SUCCESS) { - return Result; - } - } - PiPlatformCachePopulated = true; - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - } - - // Populate returned platforms from the cache. - if (Platforms) { - PI_ASSERT(NumEntries <= PiPlatformsCache->size(), - PI_ERROR_INVALID_PLATFORM); - std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms); - } - - if (NumPlatforms) { - *NumPlatforms = PiPlatformsCache->size(); - } - - return PI_SUCCESS; + return pi2ur::piPlatformsGet(NumEntries, Platforms, NumPlatforms); } pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, @@ -3592,10 +3399,10 @@ pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, // TODO: maybe we should populate cache of platforms if it wasn't already. // For now assert that is was populated. PI_ASSERT(PiPlatformCachePopulated, PI_ERROR_INVALID_VALUE); - const std::lock_guard Lock{*PiPlatformsCacheMutex}; + const std::lock_guard Lock{*PiPlatformsCacheMutex}; pi_device Dev = nullptr; - for (auto &ThePlatform : *PiPlatformsCache) { + for (pi_platform ThePlatform : *PiPlatformsCache) { Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice); if (Dev) { // Check that the input Platform, if was given, matches the found one. @@ -9206,7 +9013,7 @@ pi_result piTearDown(void *PluginParameter) { (void)PluginParameter; bool LeakFound = false; // reclaim pi_platform objects here since we don't have piPlatformRelease. 
- for (pi_platform &Platform : *PiPlatformsCache) { + for (pi_platform Platform : *PiPlatformsCache) { delete Platform; } delete PiPlatformsCache; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index c876a32e05209..820bbb5d7a2a0 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -42,13 +42,13 @@ #include #include -#include -#include #include +#include +#include // Share code between this PI L0 Plugin and UR L0 Adapter -#include #include +#include #include "usm_allocator.hpp" @@ -58,7 +58,7 @@ template To pi_cast(From Value) { return (To)(Value); } -template <> uint32_t pi_cast(uint64_t Value) { +template <> uint32_t inline pi_cast(uint64_t Value) { // Cast value and check that we don't lose any information. uint32_t CastedValue = (uint32_t)(Value); assert((uint64_t)CastedValue == Value); @@ -180,13 +180,8 @@ struct MemAllocRecord : _pi_object { // Define the types that are opaque in pi.h in a manner suitabale for Level Zero // plugin -struct _pi_platform : public _ur_level_zero_platform { - _pi_platform(ze_driver_handle_t Driver) : _ur_level_zero_platform{Driver} {} - - // Performs initialization of a newly constructed PI platform. - pi_result initialize() { - return ur2piResult(_ur_level_zero_platform::initialize()); - } +struct _pi_platform : public _ur_platform_handle_t { + using _ur_platform_handle_t::_ur_platform_handle_t; // Cache pi_devices for reuse std::vector> PiDevicesCache; diff --git a/sycl/plugins/level_zero/tracing.cpp b/sycl/plugins/level_zero/tracing.cpp index 32ee0f31fec90..17ea79359ea1f 100644 --- a/sycl/plugins/level_zero/tracing.cpp +++ b/sycl/plugins/level_zero/tracing.cpp @@ -8,9 +8,9 @@ #include "xpti/xpti_data_types.h" #include -#include -#include +#include #include +#include #include diff --git a/sycl/plugins/level_zero/ur_bindings.hpp b/sycl/plugins/level_zero/ur_bindings.hpp new file mode 100644 index 0000000000000..333447613f575 --- /dev/null +++ b/sycl/plugins/level_zero/ur_bindings.hpp @@ -0,0 +1,17 @@ +//===------ ur_bindings.hpp - Complete definitions of UR handles -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +#pragma once + +#include "pi_level_zero.hpp" +#include + +// Make the Unified Runtime handles definition complete. +// This is used in various "create" API where new handles are allocated. 
+struct _zer_platform_handle_t : public _pi_platform { + using _pi_platform::_pi_platform; +}; diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index f442db4a4f942..783531aab932b 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -1,35 +1,31 @@ # PI Unified Runtime plugin library # +if (NOT DEFINED UNIFIED_RUNTIME_INCLUDE_DIR) + include(FetchContent) -include(FetchContent) + set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG fd711c920acc4434cb52ff18b078c082d9d7f44d) -set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") -set(UNIFIED_RUNTIME_TAG fd711c920acc4434cb52ff18b078c082d9d7f44d) - -message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") -FetchContent_Declare(unified-runtime - GIT_REPOSITORY ${UNIFIED_RUNTIME_REPO} - GIT_TAG ${UNIFIED_RUNTIME_TAG} -) - -FetchContent_MakeAvailable(unified-runtime) -FetchContent_GetProperties(unified-runtime) - -set(UNIFIED_RUNTIME_SOURCE_DIR - ${unified-runtime_SOURCE_DIR} CACHE PATH "Path to Unified Runtime Headers") -set(UNIFIED_RUNTIME_INCLUDE_DIR "${UNIFIED_RUNTIME_SOURCE_DIR}/include") + message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") + FetchContent_Declare(unified-runtime + GIT_REPOSITORY ${UNIFIED_RUNTIME_REPO} + GIT_TAG ${UNIFIED_RUNTIME_TAG} + ) + FetchContent_MakeAvailable(unified-runtime) + FetchContent_GetProperties(unified-runtime) -#include_directories("${LEVEL_ZERO_INCLUDE_DIR}") -include_directories("${UNIFIED_RUNTIME_INCLUDE_DIR}") + set(UNIFIED_RUNTIME_SOURCE_DIR + ${unified-runtime_SOURCE_DIR} CACHE PATH "Path to Unified Runtime Headers") + set(UNIFIED_RUNTIME_INCLUDE_DIR "${UNIFIED_RUNTIME_SOURCE_DIR}/include") +endif() add_library (UnifiedRuntime-Headers INTERFACE) target_include_directories(UnifiedRuntime-Headers - INTERFACE "${UNIFIED_RUNTIME_INCLUDE_DIR}" + INTERFACE + "${UNIFIED_RUNTIME_INCLUDE_DIR}" ) -list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS unified-runtime) - find_package(Threads REQUIRED) # @@ -37,42 +33,26 @@ find_package(Threads REQUIRED) # So what we really build is the Unified Runtime with Level Zero Adapter # together. # -# TODO: begin -# Unified Runtime today is not a plugin yet. It's experimental static -# library that's linked into level_zero plugin. As soon as it's ready to -# become a plugin code below should be replaced with the following: -#add_sycl_plugin(unified_runtime -# SOURCES -# "${sycl_inc_dir}/sycl/detail/pi.h" -# "${CMAKE_CURRENT_SOURCE_DIR}/pi2ur.cpp" -# "${CMAKE_CURRENT_SOURCE_DIR}/pi2ur.hpp" -# "${CMAKE_CURRENT_SOURCE_DIR}/ur.hpp" -# "${CMAKE_CURRENT_SOURCE_DIR}/adapters/level_zero/ur_level_zero.hpp" -# "${CMAKE_CURRENT_SOURCE_DIR}/adapters/level_zero/ur_level_zero.cpp" -# LIBRARIES -# Threads::Threads -# UnifiedRuntime-Headers -# LevelZeroLoader-Headers -#) - -add_library(unified_runtime STATIC - "${sycl_inc_dir}/sycl/detail/pi.h" - "${CMAKE_CURRENT_SOURCE_DIR}/ur.hpp" - "${CMAKE_CURRENT_SOURCE_DIR}/adapters/level_zero/ur_level_zero.hpp" - "${CMAKE_CURRENT_SOURCE_DIR}/adapters/level_zero/ur_level_zero.cpp" +add_sycl_plugin(unified_runtime + SOURCES + # These are short-term shared with Unified Runtime + # The two plugins define a few things differrently so must + # be built separately. This difference is spelled in + # their "ur_bindings.hpp" files. 
+ # + "ur_bindings.hpp" + "pi2ur.hpp" + "pi2ur.cpp" + "ur/ur.hpp" + "ur/ur.cpp" + "ur/adapters/level_zero/ur_level_zero.hpp" + "ur/adapters/level_zero/ur_level_zero.cpp" + # These below belong to Unified Runtime PI Plugin only + "pi_unified_runtime.hpp" + "pi_unified_runtime.cpp" + LIBRARIES + Threads::Threads + UnifiedRuntime-Headers + LevelZeroLoader-Headers + LevelZeroLoader ) - -target_include_directories(unified_runtime INTERFACE - "${UNIFIED_RUNTIME_INCLUDE_DIR}" - "${CMAKE_CURRENT_SOURCE_DIR}" -) - -target_link_libraries(unified_runtime PRIVATE - UnifiedRuntime-Headers - LevelZeroLoader-Headers -) - -# TODO: end - -add_dependencies(unified_runtime ze-api) - diff --git a/sycl/plugins/unified_runtime/pi2ur.cpp b/sycl/plugins/unified_runtime/pi2ur.cpp index a10ca7534ef14..a80fa92f55c31 100644 --- a/sycl/plugins/unified_runtime/pi2ur.cpp +++ b/sycl/plugins/unified_runtime/pi2ur.cpp @@ -8,34 +8,3 @@ // This thin layer performs conversion from PI API to Unified Runtime API // TODO: remove when SYCL RT is changed to talk in UR directly - -#include -#include - -// Early exits on any error -#define HANDLE_ERRORS(urCall) \ - if (auto Result = urCall) \ - return ur2piResult(Result); - -__SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 num_entries, - pi_platform *platforms, - pi_uint32 *num_platforms) { - - // https://spec.oneapi.io/unified-runtime/latest/core/api.html#zerplatformget - - uint32_t Count = num_entries; - auto phPlatforms = reinterpret_cast(platforms); - HANDLE_ERRORS(zerPlatformGet(&Count, phPlatforms)); - if (*num_platforms) { - *num_platforms = Count; - } - return PI_SUCCESS; -} - -__SYCL_EXPORT pi_result piPlatformGetInfo(pi_platform platform, - pi_platform_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - die("Unified Runtime: piPlatformGetInfo is not implemented"); -} diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp old mode 100755 new mode 100644 index d461bac4a2cfb..52084d5a04aaa --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1,4 +1,4 @@ -//===---------------- pi2ur.cpp - PI API to UR API --------------------==// +//===---------------- pi2ur.hpp - PI API to UR API --------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -11,6 +11,7 @@ #include "zer_api.h" #include +#include // Map of UR error codes to PI error codes static pi_result ur2piResult(zer_result_t urResult) { @@ -22,6 +23,7 @@ static pi_result ur2piResult(zer_result_t urResult) { // static std::unordered_map ErrorMapping = { {ZER_RESULT_SUCCESS, PI_SUCCESS}, + {ZER_RESULT_ERROR_UNKNOWN, PI_ERROR_UNKNOWN}, {ZER_RESULT_ERROR_DEVICE_LOST, PI_ERROR_DEVICE_NOT_FOUND}, {ZER_RESULT_INVALID_OPERATION, PI_ERROR_INVALID_OPERATION}, {ZER_RESULT_INVALID_PLATFORM, PI_ERROR_INVALID_PLATFORM}, @@ -42,3 +44,31 @@ static pi_result ur2piResult(zer_result_t urResult) { } return It->second; } + +// Early exits on any error +#define HANDLE_ERRORS(urCall) \ + if (auto Result = urCall) \ + return ur2piResult(Result); + +namespace pi2ur { +inline pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, + pi_uint32 *num_platforms) { + + // https://spec.oneapi.io/unified-runtime/latest/core/api.html#zerplatformget + + uint32_t Count = num_entries; + auto phPlatforms = reinterpret_cast(platforms); + HANDLE_ERRORS(zerPlatformGet(&Count, phPlatforms)); + if (num_platforms) { + *num_platforms = Count; + } + return PI_SUCCESS; +} + +inline pi_result piPlatformGetInfo(pi_platform platform, + pi_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + die("Unified Runtime: piPlatformGetInfo is not implemented"); +} +} // namespace pi2ur diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp new file mode 100644 index 0000000000000..fe7cab726caa6 --- /dev/null +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -0,0 +1,17 @@ +//===--- pi_unified_runtime.cpp - Unified Runtime PI Plugin -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// + +#include + +extern "C" { +__SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 num_entries, + pi_platform *platforms, + pi_uint32 *num_platforms) { + return pi2ur::piPlatformsGet(num_entries, platforms, num_platforms); +} +} // extern "C diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.hpp b/sycl/plugins/unified_runtime/pi_unified_runtime.hpp new file mode 100644 index 0000000000000..1d40e4b9eb559 --- /dev/null +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.hpp @@ -0,0 +1,8 @@ +//===--- pi_unified_runtime.hpp - Unified Runtime PI Plugin -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +#pragma once diff --git a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp similarity index 50% rename from sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.cpp rename to sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index 8160c1256ad22..cfbdda10db5ee 100644 --- a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -6,9 +6,11 @@ // //===-----------------------------------------------------------------===// +#include #include #include "ur_level_zero.hpp" +#include // Define the static class field std::mutex ZeCall::GlobalLock; @@ -49,6 +51,74 @@ bool setEnvVar(const char *name, const char *value) { return ze2urResult(Result); \ } +// This will count the calls to Level-Zero +std::map *ZeCallCount = nullptr; + +inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { + switch (ZeError) { +#define ZE_ERRCASE(ERR) \ + case ERR: \ + ErrorString = "" #ERR; \ + break; + + ZE_ERRCASE(ZE_RESULT_SUCCESS) + ZE_ERRCASE(ZE_RESULT_NOT_READY) + ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST) + ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) + ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) + ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) + ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) + ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) + ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) + ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) + ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN) + +#undef ZE_ERRCASE + default: + assert(false && "Unexpected Error code"); + } // switch +} + +ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, + const char *ZeArgs, bool TraceError) { + zePrint("ZE ---> %s%s\n", ZeName, ZeArgs); + + if (ZeDebug & ZE_DEBUG_CALL_COUNT) { + ++(*ZeCallCount)[ZeName]; + } + + if (ZeResult && TraceError) { + const char *ErrorString = "Unknown"; + zeParseError(ZeResult, ErrorString); + zePrint("Error (%s) in %s\n", ErrorString, ZeName); + } + return ZeResult; +} + // 
Specializations for various L0 structures template <> ze_structure_type_t getZeStructureType() { return ZE_STRUCTURE_TYPE_EVENT_POOL_DESC; @@ -157,7 +227,7 @@ template <> zes_structure_type_t getZesStructureType() { return ZES_STRUCTURE_TYPE_MEM_PROPERTIES; } -zer_result_t _ur_level_zero_platform::initialize() { +zer_result_t _ur_platform_handle_t::initialize() { // Cache driver properties ZeStruct ZeDriverProperties; ZE_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); @@ -208,3 +278,119 @@ zer_result_t _ur_level_zero_platform::initialize() { return ZER_RESULT_SUCCESS; } + +ZER_APIEXPORT zer_result_t ZER_APICALL zerPlatformGet( + uint32_t + *NumPlatforms, ///< [in,out] pointer to the number of platforms. + ///< if count is zero, then the call shall update the + ///< value with the total number of platforms available. + ///< if count is greater than the number of platforms + ///< available, then the call shall update the value with + ///< the correct number of platforms available. + zer_platform_handle_t + *Platforms ///< [out][optional][range(0, *pCount)] array of handle of + ///< platforms. if count is less than the number of platforms + ///< available, then platform shall only retrieve that number + ///< of platforms. +) { + PI_ASSERT(NumPlatforms, ZER_RESULT_INVALID_VALUE); + + static std::once_flag ZeCallCountInitialized; + try { + std::call_once(ZeCallCountInitialized, []() { + if (ZeDebug & ZE_DEBUG_CALL_COUNT) { + ZeCallCount = new std::map; + } + }); + } catch (const std::bad_alloc &) { + return ZER_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return ZER_RESULT_ERROR_UNKNOWN; + } + + // Setting these environment variables before running zeInit will enable the + // validation layer in the Level Zero loader. + if (ZeDebug & ZE_DEBUG_VALIDATION) { + setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); + setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); + } + + // Enable SYSMAN support for obtaining the PCI address + // and maximum memory bandwidth. + if (getenv("SYCL_ENABLE_PCI") != nullptr) { + setEnvVar("ZES_ENABLE_SYSMAN", "1"); + } + + // TODO: We can still safely recover if something goes wrong during the init. + // Implement handling segfault using sigaction. + + // We must only initialize the driver once, even if piPlatformsGet() is called + // multiple times. Declaring the return value as "static" ensures it's only + // called once. + static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); + + // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + PI_ASSERT(NumPlatforms != 0, ZER_RESULT_INVALID_VALUE); + *NumPlatforms = 0; + return ZER_RESULT_SUCCESS; + } + + if (ZeResult != ZE_RESULT_SUCCESS) { + zePrint("zeInit: Level Zero initialization failure\n"); + return ze2urResult(ZeResult); + } + + // Cache pi_platforms for reuse in the future + // It solves two problems; + // 1. sycl::platform equality issue; we always return the same pi_platform. + // 2. performance; we can save time by immediately return from cache. + // + + const std::lock_guard Lock{*PiPlatformsCacheMutex}; + if (!PiPlatformCachePopulated) { + try { + // Level Zero does not have concept of Platforms, but Level Zero driver is + // the closest match. 
+ uint32_t ZeDriverCount = 0; + ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); + if (ZeDriverCount == 0) { + PiPlatformCachePopulated = true; + } else { + std::vector ZeDrivers; + ZeDrivers.resize(ZeDriverCount); + + ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); + for (uint32_t I = 0; I < ZeDriverCount; ++I) { + auto Platform = new _zer_platform_handle_t(ZeDrivers[I]); + // Save a copy in the cache for future uses. + PiPlatformsCache->push_back(Platform); + + zer_result_t Result = Platform->initialize(); + if (Result != ZER_RESULT_SUCCESS) { + return Result; + } + } + PiPlatformCachePopulated = true; + } + } catch (const std::bad_alloc &) { + return ZER_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return ZER_RESULT_ERROR_UNKNOWN; + } + } + + // Populate returned platforms from the cache. + if (Platforms) { + PI_ASSERT(*NumPlatforms <= PiPlatformsCache->size(), + ZER_RESULT_INVALID_PLATFORM); + std::copy_n(PiPlatformsCache->begin(), *NumPlatforms, Platforms); + } + + if (*NumPlatforms == 0) + *NumPlatforms = PiPlatformsCache->size(); + else + *NumPlatforms = std::min(PiPlatformsCache->size(), (size_t)*NumPlatforms); + + return ZER_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp similarity index 92% rename from sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.hpp rename to sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp index 90d41957bf50c..6c292139177d7 100644 --- a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp @@ -7,6 +7,7 @@ //===-----------------------------------------------------------------===// #pragma once +#include #include #include #include @@ -14,10 +15,10 @@ #include #include -#include -#include -#include +#include +#include #include +#include // Returns the ze_structure_type_t to use in .stype of a structured descriptor. // Intentionally not defined; will give an error if no proper specialization @@ -143,8 +144,8 @@ bool setEnvVar(const char *name, const char *value); #define ZE_CALL_NOCHECK(ZeName, ZeArgs) \ ZeCall().doCall(ZeName ZeArgs, #ZeName, #ZeArgs, false) -struct _ur_level_zero_platform : public _ur_platform { - _ur_level_zero_platform(ze_driver_handle_t Driver) : ZeDriver{Driver} {} +struct _ur_platform_handle_t : public _ur_platform { + _ur_platform_handle_t(ze_driver_handle_t Driver) : ZeDriver{Driver} {} // Performs initialization of a newly constructed PI platform. zer_result_t initialize(); @@ -165,7 +166,7 @@ struct _ur_level_zero_platform : public _ur_platform { bool ZeDriverModuleProgramExtensionFound{false}; }; -using ur_level_zero_platform = _ur_level_zero_platform *; +using ur_platform_handle_t = _ur_platform_handle_t *; class ZeUSMImportExtension { // Pointers to functions that import/release host memory into USM @@ -180,7 +181,7 @@ class ZeUSMImportExtension { ZeUSMImportExtension() : Enabled{false} {} - void setZeUSMImport(ur_level_zero_platform Platform) { + void setZeUSMImport(ur_platform_handle_t Platform) { // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting // host ptr import during buffer creation. const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT"); @@ -216,4 +217,8 @@ class ZeUSMImportExtension { } }; +// Helper wrapper for working with USM import extension in Level Zero. 
 extern ZeUSMImportExtension ZeUSMImport;
+
+// This will count the calls to Level-Zero
+extern std::map *ZeCallCount;
diff --git a/sycl/plugins/unified_runtime/ur/ur.cpp b/sycl/plugins/unified_runtime/ur/ur.cpp
new file mode 100644
index 0000000000000..c612135a8cdf9
--- /dev/null
+++ b/sycl/plugins/unified_runtime/ur/ur.cpp
@@ -0,0 +1,26 @@
+
+//===--------- ur.cpp - Unified Runtime -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------===//
+
+#include "ur.hpp"
+
+// Controls tracing UR calls from within the UR itself.
+bool PrintTrace = [] {
+  const char *Trace = std::getenv("SYCL_PI_TRACE");
+  const int TraceValue = Trace ? std::stoi(Trace) : 0;
+  if (TraceValue == -1 || TraceValue == 2) { // Means print all traces
+    return true;
+  }
+  return false;
+}();
+
+// Apparatus for maintaining immutable cache of platforms.
+std::vector *PiPlatformsCache =
+    new std::vector;
+SpinLock *PiPlatformsCacheMutex = new SpinLock;
+bool PiPlatformCachePopulated = false;
diff --git a/sycl/plugins/unified_runtime/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp
old mode 100755
new mode 100644
similarity index 54%
rename from sycl/plugins/unified_runtime/ur.hpp
rename to sycl/plugins/unified_runtime/ur/ur.hpp
index 63b4a54bb1ffd..89bd0551d65de
--- a/sycl/plugins/unified_runtime/ur.hpp
+++ b/sycl/plugins/unified_runtime/ur/ur.hpp
@@ -7,9 +7,12 @@
 //===-----------------------------------------------------------------===//
 #pragma once
 
+#include 
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -32,15 +35,14 @@ static const bool SingleThreadMode = [] {
 // If SingleThreadMode variable is set then mutex operations are turned into
 // nop.
 class pi_shared_mutex {
-  std::shared_mutex Mutex;
+  std::shared_mutex Mutex;
+
 public:
   void lock() {
     if (!SingleThreadMode)
       Mutex.lock();
   }
-  bool try_lock() {
-    return SingleThreadMode ? true : Mutex.try_lock();
-  }
+  bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); }
   void unlock() {
     if (!SingleThreadMode)
       Mutex.unlock();
@@ -64,6 +66,7 @@ class pi_shared_mutex {
 // nop.
 class pi_mutex {
   std::mutex Mutex;
+
 public:
   void lock() {
     if (!SingleThreadMode)
@@ -76,5 +79,43 @@ class pi_mutex {
   }
 };
 
+/// SpinLock is a synchronization primitive that uses an atomic flag and
+/// makes a thread that is trying to acquire the lock spin in a loop,
+/// repeatedly checking whether the lock is available.
+///
+/// One important feature of this implementation is that the atomic flag can
+/// be zero-initialized. This gives SpinLock a trivial constructor and
+/// destructor, which makes it possible to use it in global context (unlike
+/// std::mutex, which doesn't provide such guarantees).
+class SpinLock {
+public:
+  void lock() {
+    while (MLock.test_and_set(std::memory_order_acquire))
+      std::this_thread::yield();
+  }
+  void unlock() { MLock.clear(std::memory_order_release); }
+
+private:
+  std::atomic_flag MLock = ATOMIC_FLAG_INIT;
+};
+
+// Helper for one-liner validation
+#define PI_ASSERT(condition, error)                                           \
+  if (!(condition))                                                           \
+    return error;
+
 // TODO: populate with target agnostic handling of UR platforms
 struct _ur_platform {};
+
+// Controls tracing UR calls from within the UR itself.
+extern bool PrintTrace;
+
+// Apparatus for maintaining immutable cache of platforms.
+// +// Note we only create a simple pointer variables such that C++ RT won't +// deallocate them automatically at the end of the main program. +// The heap memory allocated for these global variables reclaimed only at +// explicit tear-down. +extern std::vector *PiPlatformsCache; +extern SpinLock *PiPlatformsCacheMutex; +extern bool PiPlatformCachePopulated; diff --git a/sycl/plugins/unified_runtime/ur_bindings.hpp b/sycl/plugins/unified_runtime/ur_bindings.hpp new file mode 100644 index 0000000000000..f09007158edb3 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur_bindings.hpp @@ -0,0 +1,17 @@ +//===------ ur_bindings.hpp - Complete definitions of UR handles -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +#pragma once + +#include +#include + +// Make the Unified Runtime handles definition complete. +// This is used in various "create" API where new handles are allocated. +struct _zer_platform_handle_t : public _ur_platform_handle_t { + using _ur_platform_handle_t::_ur_platform_handle_t; +}; diff --git a/sycl/test/basic_tests/interop-backend-traits.cpp b/sycl/test/basic_tests/interop-backend-traits.cpp index 6414e218415c9..13ba493af05af 100644 --- a/sycl/test/basic_tests/interop-backend-traits.cpp +++ b/sycl/test/basic_tests/interop-backend-traits.cpp @@ -1,5 +1,5 @@ // RUN: %clangxx -fsycl -DUSE_OPENCL %s -// RUN: %clangxx -fsycl -DUSE_L0 %s +// RUN: %clangxx %fsycl-host-only -DUSE_L0 %s // RUN: %clangxx -fsycl -DUSE_CUDA %s // RUN: %clangxx -fsycl -DUSE_HIP %s // RUN: %clangxx -fsycl -DUSE_CUDA_EXPERIMENTAL %s @@ -13,7 +13,7 @@ constexpr auto Backend = sycl::backend::opencl; #endif #ifdef USE_L0 -#include +#include #include diff --git a/sycl/test/basic_tests/interop-level-zero-2020.cpp b/sycl/test/basic_tests/interop-level-zero-2020.cpp index 6274786bea46b..e73133850c823 100644 --- a/sycl/test/basic_tests/interop-level-zero-2020.cpp +++ b/sycl/test/basic_tests/interop-level-zero-2020.cpp @@ -4,7 +4,7 @@ // Test for SYCL-2020 Level Zero interop API // clang-format off -#include +#include #include // clang-format on diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index cde5d4e90b319..c5d217838f1f0 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -80,13 +80,14 @@ config.substitutions.append( ('%sycl_include', config.sycl_include ) ) config.substitutions.append( ('%sycl_source_dir', config.sycl_source_dir) ) config.substitutions.append( ('%opencl_libs_dir', config.opencl_libs_dir) ) +config.substitutions.append( ('%level_zero_include_dir', config.level_zero_include_dir) ) config.substitutions.append( ('%opencl_include_dir', config.opencl_include_dir) ) config.substitutions.append( ('%cuda_toolkit_include', config.cuda_toolkit_include) ) config.substitutions.append( ('%sycl_tools_src_dir', config.sycl_tools_src_dir ) ) config.substitutions.append( ('%llvm_build_lib_dir', config.llvm_build_lib_dir ) ) config.substitutions.append( ('%llvm_build_bin_dir', config.llvm_build_bin_dir ) ) -config.substitutions.append( ('%fsycl-host-only', '-std=c++17 -Xclang -fsycl-is-host -isystem %s -isystem %s -isystem %s' % (config.sycl_include, config.opencl_include_dir, config.sycl_include + '/sycl/') ) ) +config.substitutions.append( ('%fsycl-host-only', '-std=c++17 -Xclang -fsycl-is-host -isystem %s -isystem %s -isystem %s -isystem 
%s' % (config.sycl_include, config.level_zero_include_dir, config.opencl_include_dir, config.sycl_include + '/sycl/') ) )
 config.substitutions.append( ('%sycl_lib', ' -lsycl6' if platform.system() == "Windows" else '-lsycl') )
 
 llvm_config.add_tool_substitutions(['llvm-spirv'], [config.sycl_tools_dir])
 
diff --git a/sycl/test/lit.site.cfg.py.in b/sycl/test/lit.site.cfg.py.in
index 410bd18d87789..d5a6317e2e23c 100644
--- a/sycl/test/lit.site.cfg.py.in
+++ b/sycl/test/lit.site.cfg.py.in
@@ -12,6 +12,7 @@ config.sycl_libs_dir = lit_config.params.get('SYCL_LIBS_DIR', "@LLVM_LIBS_DIR@")
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.host_triple = "@LLVM_HOST_TRIPLE@"
 config.opencl_libs_dir = os.path.dirname("@OpenCL_LIBRARIES@")
+config.level_zero_include_dir = "@LEVEL_ZERO_INCLUDE_DIR@"
 config.opencl_include_dir = "@OpenCL_INCLUDE_DIR@"
 config.cuda_toolkit_include = "@CUDA_TOOLKIT_INCLUDE@"
 config.sycl_tools_src_dir = "@SYCL_TOOLS_SRC_DIR@"
diff --git a/sycl/tools/sycl-trace/CMakeLists.txt b/sycl/tools/sycl-trace/CMakeLists.txt
index 6f36b83a29391..8859c42a0ba8d 100644
--- a/sycl/tools/sycl-trace/CMakeLists.txt
+++ b/sycl/tools/sycl-trace/CMakeLists.txt
@@ -45,13 +45,13 @@ add_custom_target(pi-pretty-printers
 
 if ("level_zero" IN_LIST SYCL_ENABLE_PLUGINS)
   add_dependencies(sycl_pi_trace_collector pi_level_zero)
-  target_link_libraries(sycl_pi_trace_collector PRIVATE LevelZeroLoader::Headers)
+  target_link_libraries(sycl_pi_trace_collector PRIVATE LevelZeroLoader-Headers)
   target_compile_definitions(sycl_pi_trace_collector PRIVATE SYCL_HAS_LEVEL_ZERO)
   add_custom_target(ze-pretty-printers
     COMMAND ${Python3_EXECUTABLE}
     ${CMAKE_CURRENT_SOURCE_DIR}/generate_ze_pretty_printers.py
-    ${SYCL_INCLUDE_BUILD_DIR}/sycl/level_zero/ze_api.h
+    ${LEVEL_ZERO_INCLUDE_DIR}/ze_api.h
     DEPENDS
     pi_level_zero
     BYPRODUCTS
     ${CMAKE_CURRENT_BINARY_DIR}/ze_printers.def
diff --git a/sycl/tools/sycl-trace/ze_trace_collector.cpp b/sycl/tools/sycl-trace/ze_trace_collector.cpp
index d79e48e2874ab..fb2bf2c3d2a0e 100644
--- a/sycl/tools/sycl-trace/ze_trace_collector.cpp
+++ b/sycl/tools/sycl-trace/ze_trace_collector.cpp
@@ -13,7 +13,7 @@
 
 #include 
 
-#include 
+#include 
 #include 
 #include 
 

From d47b99e327774b37744eac333369a53102e610f0 Mon Sep 17 00:00:00 2001
From: Kseniya Tikhomirova 
Date: Fri, 16 Dec 2022 11:34:06 +0100
Subject: [PATCH 02/11] [SYCL] Win: do not cleanup scheduler resources due to
 unpredictable s… (#7801)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Windows specific. Do not try to release any resources during program
exit; it may cause unpredictable results. The main root cause is
threads: per my observations, and according to the Windows ExitProcess
documentation, threads are killed before libraries are unloaded. So by
the time shutdown is called, the thread-pool threads and any other user
threads have already been killed. What this means for us: we cannot know
the exact state of the jobs and the objects they were working with. For
example, the graph mutex could be locked by a thread that was executing
a host task, a container could be left in an undefined state if the
killed thread was working with it, or an additional user thread could
have been calling SYCL APIs, in which case resources of SYCL's
dependencies may be affected as well.
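To make the hazard concrete, here is a minimal standalone C++ sketch
(illustrative only; GraphLock and worker are hypothetical names, not
actual SYCL runtime symbols): a thread terminated at process exit while
holding a mutex leaves that mutex locked forever, so any teardown code
that later takes the same lock hangs.

    #include <mutex>
    #include <thread>

    std::mutex GraphLock; // stand-in for the scheduler's graph mutex

    void worker() {
      std::lock_guard<std::mutex> Guard(GraphLock);
      // On Windows, ExitProcess() terminates this thread with GraphLock
      // still held; teardown code that then locks GraphLock never returns.
      while (true)
        std::this_thread::yield();
    }

    int main() {
      std::thread(worker).detach();
      // Returning from main() starts process exit while the worker still
      // holds the lock, which is why cleanup is skipped on Windows.
    }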
Signed-off-by: Tikhomirova, Kseniya 
---
 sycl/source/detail/scheduler/scheduler.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index d5a0a5a382e6c..171cd0aae5eee 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -365,6 +365,7 @@ Scheduler::Scheduler() {
 Scheduler::~Scheduler() { DefaultHostQueue.reset(); }
 
 void Scheduler::releaseResources() {
+#ifndef _WIN32
   if (DefaultHostQueue) {
     DefaultHostQueue->wait();
   }
@@ -385,6 +386,7 @@ void Scheduler::releaseResources() {
   // added to deferred mem obj storage. So we may end up with leak.
   while (!isDeferredMemObjectsEmpty())
     cleanupDeferredMemObjects(BlockingT::BLOCKING);
+#endif
 }
 
 MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) {

From 3c8cf0b013f4908ad31a931d984c448c58b74d9f Mon Sep 17 00:00:00 2001
From: Steffen Larsen 
Date: Fri, 16 Dec 2022 13:58:31 +0100
Subject: [PATCH 03/11] [SYCL] Use decorated pointer in device_global (#7796)

device_global without device_image_scope currently uses an undecorated
pointer for its underlying type, but we know that the pointer will be
to the global memory space. This commit changes the underlying member
for device_global without device_image_scope to be a decorated pointer
and changes the get_multi_ptr member function to create the multi_ptr
directly from the decorated pointer instead of performing unnecessary
address space casts.

Signed-off-by: Larsen, Steffen 
---
 .../oneapi/device_global/device_global.hpp   | 58 +++++++++++++------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/device_global/device_global.hpp b/sycl/include/sycl/ext/oneapi/device_global/device_global.hpp
index 6747c01066881..e0452e9d65e46 100644
--- a/sycl/include/sycl/ext/oneapi/device_global/device_global.hpp
+++ b/sycl/include/sycl/ext/oneapi/device_global/device_global.hpp
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef __SYCL_DEVICE_ONLY__
 #define __SYCL_HOST_NOT_SUPPORTED(Op)
@@ -42,9 +43,27 @@ struct HasArrowOperator<
 
 template 
 class device_global_base {
 protected:
-  T *usmptr;
-  T *get_ptr() noexcept { return usmptr; }
-  const T *get_ptr() const noexcept { return usmptr; }
+  using pointer_t = typename decorated_global_ptr::pointer;
+  pointer_t usmptr;
+  pointer_t get_ptr() noexcept { return usmptr; }
+  const pointer_t get_ptr() const noexcept { return usmptr; }
+
+public:
+  template 
+  multi_ptr
+  get_multi_ptr() noexcept {
+    __SYCL_HOST_NOT_SUPPORTED("get_multi_ptr()")
+    return multi_ptr{
+        get_ptr()};
+  }
+
+  template 
+  multi_ptr
+  get_multi_ptr() const noexcept {
+    __SYCL_HOST_NOT_SUPPORTED("get_multi_ptr()")
+    return multi_ptr{
+        get_ptr()};
+  }
 };
 
 // Specialization of device_global base class for when device_image_scope is in
@@ -58,6 +77,23 @@ class device_global_base<
   T val{};
   T *get_ptr() noexcept { return &val; }
   const T *get_ptr() const noexcept { return &val; }
+
+public:
+  template 
+  multi_ptr
+  get_multi_ptr() noexcept {
+    __SYCL_HOST_NOT_SUPPORTED("get_multi_ptr()")
+    return address_space_cast(this->get_ptr());
+  }
+
+  template 
+  multi_ptr
+  get_multi_ptr() const noexcept {
+    __SYCL_HOST_NOT_SUPPORTED("get_multi_ptr()")
+    return address_space_cast(this->get_ptr());
+  }
 };
 } // namespace detail
 
@@ -113,22 +149,6 @@ class device_global
   &operator=(const device_global &) = delete;
   device_global &operator=(const device_global &&) = delete;
 
-  template 
-  multi_ptr
-  get_multi_ptr() noexcept {
-
__SYCL_HOST_NOT_SUPPORTED("get_multi_ptr()") - return address_space_cast( - this->get_ptr()); - } - - template - multi_ptr - get_multi_ptr() const noexcept { - __SYCL_HOST_NOT_SUPPORTED("get_multi_ptr()") - return address_space_cast(this->get_ptr()); - } - T &get() noexcept { __SYCL_HOST_NOT_SUPPORTED("get()") return *this->get_ptr(); From f4a9ef1b78bc96949b5fbd72f45f115d48b9fcc4 Mon Sep 17 00:00:00 2001 From: yubingex007-a11y Date: Sat, 17 Dec 2022 00:18:09 +0800 Subject: [PATCH 04/11] [SYCL] Implement matrix extension using new unified interface (#7413) --- sycl/include/CL/__spirv/spirv_types.hpp | 9 + .../sycl/ext/oneapi/matrix/matrix-intel.hpp | 347 ++++++++++ .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 643 ------------------ .../ext/oneapi/matrix/matrix-tensorcores.hpp | 25 +- .../oneapi/matrix/matrix-unified-utils.hpp | 26 + .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 162 ++++- .../include/sycl/ext/oneapi/matrix/matrix.hpp | 5 +- .../ext/oneapi/matrix/static-query-use.hpp | 40 +- .../{ => legacy}/matrix-bf16-test-SG-16.cpp | 69 +- .../matrix/{ => legacy}/matrix-bf16-test.cpp | 64 +- .../matrix-bfloat16-test.cpp} | 18 +- .../matrix/legacy/matrix-elemwise-ops.cpp | 175 +++++ .../{ => legacy}/matrix-int8-test-SG-16.cpp | 42 +- .../matrix-int8-test.cpp} | 21 +- sycl/test/matrix/matrix-bfloat16-test.cpp | 24 +- sycl/test/matrix/matrix-elemwise-ops.cpp | 25 +- sycl/test/matrix/matrix-int8-test.cpp | 65 +- sycl/test/matrix/query-use.cpp | 34 +- 18 files changed, 931 insertions(+), 863 deletions(-) create mode 100644 sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp delete mode 100644 sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp create mode 100644 sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp rename sycl/test/matrix/{ => legacy}/matrix-bf16-test-SG-16.cpp (76%) rename sycl/test/matrix/{ => legacy}/matrix-bf16-test.cpp (78%) rename sycl/test/matrix/{matrix-bfloat16-test-use.cpp => legacy/matrix-bfloat16-test.cpp} (91%) create mode 100644 sycl/test/matrix/legacy/matrix-elemwise-ops.cpp rename sycl/test/matrix/{ => legacy}/matrix-int8-test-SG-16.cpp (83%) rename sycl/test/matrix/{matrix-int8-test-use.cpp => legacy/matrix-int8-test.cpp} (88%) diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp index befbcf095985c..c19f884c448ee 100644 --- a/sycl/include/CL/__spirv/spirv_types.hpp +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -108,6 +108,14 @@ enum class GroupOperation : uint32_t { ExclusiveScan = 2 }; +#if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) +enum class MatrixLayout : uint32_t { + RowMajor = 0, + ColumnMajor = 1, + Packed = 2, + Dynamic = 3 +}; +#else enum class MatrixLayout : uint32_t { RowMajor = 0, ColumnMajor = 1, @@ -115,6 +123,7 @@ enum class MatrixLayout : uint32_t { PackedB = 3, Unused = 4 }; +#endif enum class MatrixUse : uint32_t { MatrixA = 0, MatrixB = 1, Accumulator = 2 }; diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp new file mode 100644 index 0000000000000..1adcde9054e46 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp @@ -0,0 +1,347 @@ +//==------------------ matrix-intel.hpp - SYCL matrix ----------*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ===--------------------------------------------------------------------=== // + +#pragma once + +#include "matrix-unified-utils.hpp" +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace ext { +namespace intel::experimental::matrix::layout { +constexpr sycl::ext::oneapi::experimental::matrix::layout packed = + static_cast(2); +} +namespace oneapi { +namespace experimental { +namespace matrix { + +template struct spv_matrix_layout_traits { + static constexpr __spv::MatrixLayout value = __spv::MatrixLayout::Dynamic; +}; + +#define SPV_MATRIX_LAYOUT_TRAITS(LAYOUT, SPV_LAYOUT) \ + template <> struct spv_matrix_layout_traits { \ + static constexpr __spv::MatrixLayout value = SPV_LAYOUT; \ + }; + +SPV_MATRIX_LAYOUT_TRAITS(layout::row_major, __spv::MatrixLayout::RowMajor) +SPV_MATRIX_LAYOUT_TRAITS(layout::col_major, __spv::MatrixLayout::ColumnMajor) +SPV_MATRIX_LAYOUT_TRAITS(sycl::ext::intel::experimental::matrix::layout::packed, + __spv::MatrixLayout::Packed) +SPV_MATRIX_LAYOUT_TRAITS(layout::dynamic, __spv::MatrixLayout::Dynamic) + +template struct spv_matrix_use_traits { + static constexpr __spv::MatrixUse value = __spv::MatrixUse::MatrixA; +}; + +#define SPV_MATRIX_USE_TRAITS(USE, SPV_USE) \ + template <> struct spv_matrix_use_traits { \ + static constexpr __spv::MatrixUse value = SPV_USE; \ + }; + +SPV_MATRIX_USE_TRAITS(use::a, __spv::MatrixUse::MatrixA) +SPV_MATRIX_USE_TRAITS(use::b, __spv::MatrixUse::MatrixB) +SPV_MATRIX_USE_TRAITS(use::accumulator, __spv::MatrixUse::Accumulator) + +template struct spv_scope_traits {}; +template <> struct spv_scope_traits { + constexpr static auto value = __spv::Scope::Subgroup; +}; +template struct spv_scope_traits> { + constexpr static auto value = __spv::Scope::Workgroup; +}; + +// forward declarations +template +struct joint_matrix; + +template +class wi_element { + joint_matrix &M; + std::size_t idx; + +public: + wi_element(joint_matrix &Mat, + std::size_t i) + : M(Mat), idx(i) {} + operator T() { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_VectorExtractDynamic(M.spvm, idx); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + explicit operator bool() { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_VectorExtractDynamic(M.spvm, idx) != static_cast(0); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + template wi_element &operator=(const T2 &rhs) { +#ifdef __SYCL_DEVICE_ONLY__ + M.spvm = __spirv_VectorInsertDynamic(M.spvm, static_cast(rhs), idx); + return *this; +#else + (void)rhs; + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + wi_element & + operator=(const wi_element &rhs) { +#ifdef __SYCL_DEVICE_ONLY__ + M.spvm = __spirv_VectorInsertDynamic( + M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); + return *this; +#else + (void)rhs; + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + +#if __SYCL_DEVICE_ONLY__ +#define OP(op) \ + template wi_element &operator op##=(const T2 &rhs) { \ + M.spvm = __spirv_VectorInsertDynamic( \ + M.spvm, \ + static_cast(__spirv_VectorExtractDynamic(M.spvm, idx) \ + op static_cast(rhs)), \ + idx); \ + return *this; \ + } +#else // 
__SYCL_DEVICE_ONLY__ +#define OP(op) \ + template wi_element &operator op##=(const T2 &rhs) { \ + (void)rhs; \ + throw runtime_error("joint matrix is not supported on host device.", \ + PI_ERROR_INVALID_DEVICE); \ + } +#endif // __SYCL_DEVICE_ONLY__ + OP(+) + OP(-) + OP(*) + OP(/) +#undef OP +}; + +template +class wi_element { + joint_matrix &M; + std::size_t idx; + +public: + wi_element(joint_matrix &Mat, + std::size_t i) + : M(Mat), idx(i) {} + operator sycl::ext::oneapi::bfloat16() { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_VectorExtractDynamic(M.spvm, idx); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + explicit operator bool() { +#ifdef __SYCL_DEVICE_ONLY__ + return std::fabs(static_cast(__spirv_VectorExtractDynamic( + M.spvm, idx))) >= std::numeric_limits::epsilon(); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + wi_element &operator=(const sycl::ext::oneapi::bfloat16 &rhs) { +#ifdef __SYCL_DEVICE_ONLY__ + M.spvm = __spirv_VectorInsertDynamic(M.spvm, rhs, idx); + return *this; +#else + (void)rhs; + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + wi_element &operator=(const wi_element &rhs) { +#ifdef __SYCL_DEVICE_ONLY__ + M.spvm = __spirv_VectorInsertDynamic( + M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); + return *this; +#else + (void)rhs; + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + +#if __SYCL_DEVICE_ONLY__ +#define OP(opassign, op) \ + wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) { \ + M.spvm = __spirv_VectorInsertDynamic( \ + M.spvm, __spirv_VectorExtractDynamic(M.spvm, idx) op rhs, idx); \ + return *this; \ + } +#else // __SYCL_DEVICE_ONLY__ +#define OP(opassign, op) \ + wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) { \ + (void)rhs; \ + throw runtime_error("joint matrix is not supported on host device.", \ + PI_ERROR_INVALID_DEVICE); \ + } +#endif // __SYCL_DEVICE_ONLY__ + OP(+=, +) + OP(-=, -) + OP(*=, *) + OP(/=, /) +#undef OP + +#if __SYCL_DEVICE_ONLY__ +#define OP(type, op) \ + friend type operator op( \ + const wi_element &lhs, \ + const sycl::ext::oneapi::bfloat16 &rhs) { \ + return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs; \ + } \ + friend type operator op( \ + const sycl::ext::oneapi::bfloat16 &lhs, \ + const wi_element &rhs) { \ + return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs; \ + } + OP(sycl::ext::oneapi::bfloat16, +) + OP(sycl::ext::oneapi::bfloat16, -) + OP(sycl::ext::oneapi::bfloat16, *) + OP(sycl::ext::oneapi::bfloat16, /) +#undef OP +#define OP(type, op) \ + friend type operator op( \ + const wi_element &lhs, \ + const sycl::ext::oneapi::bfloat16 &rhs) { \ + return type{static_cast(__spirv_VectorExtractDynamic( \ + lhs.M.spvm, lhs.idx)) op static_cast(rhs)}; \ + } \ + friend type operator op( \ + const sycl::ext::oneapi::bfloat16 &lhs, \ + const wi_element &rhs) { \ + return type{static_cast(__spirv_VectorExtractDynamic( \ + rhs.M.spvm, rhs.idx)) op static_cast(lhs)}; \ + } + OP(bool, ==) + OP(bool, !=) + OP(bool, <) + OP(bool, >) + OP(bool, <=) + OP(bool, >=) +#undef OP +#else // __SYCL_DEVICE_ONLY__ +#define OP(type, op) \ + friend type operator op( \ + const wi_element &, \ + const 
sycl::ext::oneapi::bfloat16 &) { \ + throw runtime_error("joint matrix is not supported on host device.", \ + PI_ERROR_INVALID_DEVICE); \ + } \ + friend type operator op( \ + const sycl::ext::oneapi::bfloat16 &, \ + const wi_element &) { \ + throw runtime_error("joint matrix is not supported on host device.", \ + PI_ERROR_INVALID_DEVICE); \ + } + OP(sycl::ext::oneapi::bfloat16, +) + OP(sycl::ext::oneapi::bfloat16, -) + OP(sycl::ext::oneapi::bfloat16, *) + OP(sycl::ext::oneapi::bfloat16, /) + OP(bool, ==) + OP(bool, !=) + OP(bool, <) + OP(bool, >) + OP(bool, <=) + OP(bool, >=) +#undef OP +#endif // __SYCL_DEVICE_ONLY__ +}; + +} // namespace matrix +} // namespace experimental +} // namespace oneapi + +namespace intel::experimental::matrix { +template < + typename Group, typename T, + sycl::ext::oneapi::experimental::matrix::use Use, size_t NumRows, + size_t NumCols, sycl::ext::oneapi::experimental::matrix::layout Layout, + access::address_space Space, access::decorated IsDecorated, + std::enable_if_t = true> +inline __SYCL_ALWAYS_INLINE void +joint_matrix_store(Group sg, + sycl::ext::oneapi::experimental::matrix::joint_matrix< + Group, T, Use, NumRows, NumCols, Layout> &src, + multi_ptr dst, size_t stride) { +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + std::ignore = sg; + std::ignore = src; + std::ignore = dst; + std::ignore = stride; + throw runtime_error( + "This version of the matrix extension is only currently supported on " + "intel devices", + PI_ERROR_INVALID_DEVICE); +#else + // intel's impl + T *Ptr = dst.get(); + __spirv_JointMatrixStoreINTEL::value, + sycl::ext::oneapi::experimental::matrix:: + spv_matrix_layout_traits::value>( + Ptr, src.spvm, stride, + sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits< + Layout>::value, + sycl::ext::oneapi::experimental::matrix::spv_scope_traits::value); +#endif // defined(__NVPTX__) +#else + std::ignore = sg; + std::ignore = src; + std::ignore = dst; + std::ignore = stride; + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // defined(__SYCL_DEVICE_ONLY__) +} +} // namespace intel::experimental::matrix + +} // namespace ext +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp deleted file mode 100644 index 12ce5653fe193..0000000000000 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ /dev/null @@ -1,643 +0,0 @@ -//==------------------ matrix-jit-use.hpp - SYCL matrix ----------------*- C++ -//-*---==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// ===--------------------------------------------------------------------=== // - -#pragma once - -#include -#include -#include - -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace ext::oneapi::experimental::matrix { - -// packed_a and packed_b will be replaced by packed once the use implementation -// is stable. 
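// Editor's sketch: under the unified interface, the packed_a/packed_b pair the
// comment above mentions collapses into one `packed` layout, published by
// matrix-intel.hpp from the sycl::ext::intel::experimental::matrix namespace.
// Tile shape and element type below are illustrative only, not taken from this
// patch, and the declaration belongs in a kernel body.
#include <cstdint>
#include <sycl/sycl.hpp>
namespace um = sycl::ext::oneapi::experimental::matrix;
namespace im = sycl::ext::intel::experimental::matrix;
void unified_decl_sketch() {
  // B carries the packed (VNNI) layout as a template argument; note the unified
  // joint_matrix is default-constructed, with no sub_group constructor argument.
  um::joint_matrix<sycl::sub_group, int8_t, um::use::b, 32, 16,
                   im::layout::packed>
      subB;
  (void)subB;
}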
-enum class layout { row_major, col_major, packed_a, packed_b, unused }; - -template struct spv_matrix_layout_traits { - static constexpr __spv::MatrixLayout value = __spv::MatrixLayout::Unused; -}; - -#define SPV_MATRIX_LAYOUT_TRAITS(LAYOUT, SPV_LAYOUT) \ - template <> struct spv_matrix_layout_traits { \ - static constexpr __spv::MatrixLayout value = SPV_LAYOUT; \ - }; - -SPV_MATRIX_LAYOUT_TRAITS(layout::row_major, __spv::MatrixLayout::RowMajor) -SPV_MATRIX_LAYOUT_TRAITS(layout::col_major, __spv::MatrixLayout::ColumnMajor) -SPV_MATRIX_LAYOUT_TRAITS(layout::packed_a, __spv::MatrixLayout::PackedA) -SPV_MATRIX_LAYOUT_TRAITS(layout::packed_b, __spv::MatrixLayout::PackedB) -SPV_MATRIX_LAYOUT_TRAITS(layout::unused, __spv::MatrixLayout::Unused) - -enum class use { a, b, accumulator }; - -template struct spv_matrix_use_traits { - static constexpr __spv::MatrixUse value = __spv::MatrixUse::MatrixA; -}; - -#define SPV_MATRIX_USE_TRAITS(USE, SPV_USE) \ - template <> struct spv_matrix_use_traits { \ - static constexpr __spv::MatrixUse value = SPV_USE; \ - }; - -SPV_MATRIX_USE_TRAITS(use::a, __spv::MatrixUse::MatrixA) -SPV_MATRIX_USE_TRAITS(use::b, __spv::MatrixUse::MatrixB) -SPV_MATRIX_USE_TRAITS(use::accumulator, __spv::MatrixUse::Accumulator) - -template struct spv_scope_traits {}; -template <> struct spv_scope_traits { - constexpr static auto value = __spv::Scope::Subgroup; -}; -template struct spv_scope_traits> { - constexpr static auto value = __spv::Scope::Workgroup; -}; - -template -class wi_data; -template -struct joint_matrix { -public: - __spv::__spirv_JointMatrixINTEL< - T, NumRows, NumCols, spv_matrix_layout_traits::value, - spv_scope_traits::value, spv_matrix_use_traits::value> *spvm; - joint_matrix(Group sg) { -#ifndef __SYCL_DEVICE_ONLY__ - (void)sg; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - inline __SYCL_ALWAYS_INLINE wi_data - get_wi_data() { - return wi_data(*this); - } -}; - -template -inline __SYCL_ALWAYS_INLINE void joint_matrix_load( - Group sg, - joint_matrix &res, - multi_ptr src, size_t stride, layout MemL) { -#ifdef __SYCL_DEVICE_ONLY__ - T *Ptr = src.get(); - switch (MemL) { - default: - assert(false && "Invalid Memory Layout!"); - case layout::row_major: - res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, stride, __spv::MatrixLayout::RowMajor, - spv_scope_traits::value); - break; - case layout::col_major: - res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, stride, __spv::MatrixLayout::ColumnMajor, - spv_scope_traits::value); - break; - case layout::packed_a: - res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, stride, __spv::MatrixLayout::PackedA, - spv_scope_traits::value); - break; - case layout::packed_b: - res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, stride, __spv::MatrixLayout::PackedB, - spv_scope_traits::value); - break; - } -#else - (void)sg; - (void)res; - (void)src; - (void)stride; - (void)MemL; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ -} - -template -inline __SYCL_ALWAYS_INLINE void joint_matrix_store( - Group sg, - joint_matrix 
&src, - multi_ptr res, size_t stride, layout MemL) { -#ifdef __SYCL_DEVICE_ONLY__ - T *Ptr = res.get(); - switch (MemL) { - default: - assert(false && "Invalid Memory Layout!"); - case layout::row_major: - __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, src.spvm, stride, __spv::MatrixLayout::RowMajor, - spv_scope_traits::value); - break; - case layout::col_major: - __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, src.spvm, stride, __spv::MatrixLayout::ColumnMajor, - spv_scope_traits::value); - break; - case layout::packed_a: - __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, src.spvm, stride, __spv::MatrixLayout::PackedA, - spv_scope_traits::value); - break; - case layout::packed_b: - __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, - spv_matrix_layout_traits::value>( - Ptr, src.spvm, stride, __spv::MatrixLayout::PackedB, - spv_scope_traits::value); - break; - } -#else - (void)sg; - (void)src; - (void)res; - (void)stride; - (void)MemL; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ -} - -template -inline __SYCL_ALWAYS_INLINE - joint_matrix - joint_matrix_mad( - Group sg, joint_matrix &mA, - joint_matrix &mB, - joint_matrix &mC) { -#ifdef __SYCL_DEVICE_ONLY__ - joint_matrix res(sg); - if constexpr (std::is_same::value && - std::is_same::value && - std::is_same::value) - res.spvm = __spirv_JointMatrixMadINTEL(mA.spvm, mB.spvm, mC.spvm); - else if constexpr (std::is_unsigned::value && std::is_unsigned::value) - res.spvm = __spirv_JointMatrixUUMadINTEL(mA.spvm, mB.spvm, mC.spvm); - else if constexpr (std::is_signed::value && std::is_unsigned::value) - res.spvm = __spirv_JointMatrixSUMadINTEL(mA.spvm, mB.spvm, mC.spvm); - else if constexpr (std::is_unsigned::value && std::is_signed::value) - res.spvm = __spirv_JointMatrixUSMadINTEL(mA.spvm, mB.spvm, mC.spvm); - else - res.spvm = __spirv_JointMatrixMadINTEL(mA.spvm, mB.spvm, mC.spvm); - return res; -#else - (void)sg; - (void)mA; - (void)mB; - (void)mC; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ -} - -template -inline __SYCL_ALWAYS_INLINE void -joint_matrix_fill(Group sg, - joint_matrix &res, - const T2 v) { - // We kept the unused "sg" in joint_matrix_fill to match the other DPC++ - // functions - (void)sg; -#ifdef __SYCL_DEVICE_ONLY__ - res.spvm = - __spirv_CompositeConstruct::value, - spv_matrix_layout_traits::value>( - static_cast(v)); - -#else - (void)res; - (void)v; -#endif // __SYCL_DEVICE_ONLY__ -} - -template -class wi_element { - joint_matrix &M; - std::size_t idx; - -public: - wi_element(joint_matrix &Mat, - std::size_t i) - : M(Mat), idx(i) {} - operator T() { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - explicit operator bool() { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx) != static_cast(0); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - template wi_element &operator=(const T2 &rhs) { 
-#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic(M.spvm, static_cast(rhs), idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - wi_element & - operator=(const wi_element &rhs) { -#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - -#if __SYCL_DEVICE_ONLY__ -#define OP(op) \ - template wi_element &operator op##=(const T2 &rhs) { \ - M.spvm = __spirv_VectorInsertDynamic( \ - M.spvm, \ - static_cast(__spirv_VectorExtractDynamic(M.spvm, idx) \ - op static_cast(rhs)), \ - idx); \ - return *this; \ - } -#else // __SYCL_DEVICE_ONLY__ -#define OP(op) \ - template wi_element &operator op##=(const T2 &rhs) { \ - (void)rhs; \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_ERROR_INVALID_DEVICE); \ - } -#endif // __SYCL_DEVICE_ONLY__ - OP(+) - OP(-) - OP(*) - OP(/) -#undef OP -}; - -// Note that similarly to the other matrix functions, uint16_t is used here to -// represent bf16 type. Since the AMX and DPAS implementations don't support -// uint16_t, this interpretation is possible. This design choice was made before -// the introduction of SYCL experimental bfloat16 type. Our plan is to move -// towards using the SYCL bfloat16. -template -class wi_element { - joint_matrix &M; - std::size_t idx; - -public: - wi_element(joint_matrix &Mat, - std::size_t i) - : M(Mat), idx(i) {} - operator uint16_t() { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - explicit operator bool() { -#ifdef __SYCL_DEVICE_ONLY__ - return std::fabs(make_fp32(__spirv_VectorExtractDynamic(M.spvm, idx))) >= - std::numeric_limits::epsilon(); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - wi_element &operator=(const uint16_t &rhs) { -#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic(M.spvm, rhs, idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - wi_element &operator=( - const wi_element &rhs) { -#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - // We use here the following functions for conversion (bf16=>fp32 and - // fp32=>bf16). 
This is a workaround until we are able to use - // __spirv_ConvertFToBF16INTEL and __spirv_ConvertBF16ToFINTEL once these are - // supported in the CPU backend - static float make_fp32(uint16_t x) { - unsigned int y = x; - y = y << 16; - float *res = reinterpret_cast(&y); - return *res; - } - - static uint16_t make_bf16(float x) { - int *res = reinterpret_cast(&x); - *res = *res >> 16; - return (uint16_t)*res; - } - -#if __SYCL_DEVICE_ONLY__ -#define OP(op) \ - wi_element &operator op##=(const uint16_t &rhs) { \ - M.spvm = __spirv_VectorInsertDynamic( \ - M.spvm, \ - make_bf16(make_fp32(__spirv_VectorExtractDynamic(M.spvm, idx) \ - op make_fp32(rhs))), \ - idx); \ - return *this; \ - } -#else // __SYCL_DEVICE_ONLY__ -#define OP(op) \ - wi_element &operator op##=(const uint16_t &rhs) { \ - (void)rhs; \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_ERROR_INVALID_DEVICE); \ - } -#endif // __SYCL_DEVICE_ONLY__ - OP(+) - OP(-) - OP(*) - OP(/) -#undef OP - - template struct Converter { - static T2 convert(const T1 &from) { return static_cast(from); } - }; - - template struct Converter { - static uint16_t convert(const T &from) { return make_bf16(from); } - }; -#if __SYCL_DEVICE_ONLY__ -#define OP(input_type, type, op) \ - friend type operator op( \ - const wi_element &lhs, \ - const uint16_t &rhs) { \ - return Converter::convert(make_fp32( \ - __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx)) op make_fp32(rhs)); \ - } \ - friend type operator op( \ - const uint16_t &lhs, \ - const wi_element &rhs) { \ - return Converter::convert(make_fp32( \ - __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx)) op make_fp32(lhs)); \ - } -#else // __SYCL_DEVICE_ONLY__ -#define OP(input_type, type, op) \ - friend type operator op( \ - const wi_element &lhs, \ - const uint16_t &rhs) { \ - (void)lhs; \ - (void)rhs; \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_ERROR_INVALID_DEVICE); \ - } \ - friend type operator op( \ - const uint16_t &lhs, \ - const wi_element &rhs) { \ - (void)lhs; \ - (void)rhs; \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_ERROR_INVALID_DEVICE); \ - } -#endif // __SYCL_DEVICE_ONLY__ - OP(float, uint16_t, +) - OP(float, uint16_t, -) - OP(float, uint16_t, *) - OP(float, uint16_t, /) - OP(bool, bool, ==) - OP(bool, bool, !=) - OP(bool, bool, <) - OP(bool, bool, >) - OP(bool, bool, <=) - OP(bool, bool, >=) -#undef OP -}; - -template -class wi_element { - joint_matrix &M; - std::size_t idx; - -public: - wi_element(joint_matrix &Mat, - std::size_t i) - : M(Mat), idx(i) {} - operator sycl::ext::oneapi::bfloat16() { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - explicit operator bool() { -#ifdef __SYCL_DEVICE_ONLY__ - return std::fabs(static_cast(__spirv_VectorExtractDynamic( - M.spvm, idx))) >= std::numeric_limits::epsilon(); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - wi_element &operator=(const sycl::ext::oneapi::bfloat16 &rhs) { -#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic(M.spvm, rhs, idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - wi_element &operator=(const 
wi_element &rhs) { -#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - -#if __SYCL_DEVICE_ONLY__ -#define OP(opassign, op) \ - wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) { \ - M.spvm = __spirv_VectorInsertDynamic( \ - M.spvm, __spirv_VectorExtractDynamic(M.spvm, idx) op rhs, idx); \ - return *this; \ - } -#else // __SYCL_DEVICE_ONLY__ -#define OP(opassign, op) \ - wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) { \ - (void)rhs; \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_ERROR_INVALID_DEVICE); \ - } -#endif // __SYCL_DEVICE_ONLY__ - OP(+=, +) - OP(-=, -) - OP(*=, *) - OP(/=, /) -#undef OP - -#if __SYCL_DEVICE_ONLY__ -#define OP(type, op) \ - friend type operator op( \ - const wi_element &lhs, \ - const sycl::ext::oneapi::bfloat16 &rhs) { \ - return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs; \ - } \ - friend type operator op( \ - const sycl::ext::oneapi::bfloat16 &lhs, \ - const wi_element &rhs) { \ - return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs; \ - } - OP(sycl::ext::oneapi::bfloat16, +) - OP(sycl::ext::oneapi::bfloat16, -) - OP(sycl::ext::oneapi::bfloat16, *) - OP(sycl::ext::oneapi::bfloat16, /) -#undef OP -#define OP(type, op) \ - friend type operator op( \ - const wi_element &lhs, \ - const sycl::ext::oneapi::bfloat16 &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - lhs.M.spvm, lhs.idx)) op static_cast(rhs)}; \ - } \ - friend type operator op( \ - const sycl::ext::oneapi::bfloat16 &lhs, \ - const wi_element &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - rhs.M.spvm, rhs.idx)) op static_cast(lhs)}; \ - } - OP(bool, ==) - OP(bool, !=) - OP(bool, <) - OP(bool, >) - OP(bool, <=) - OP(bool, >=) -#undef OP -#else // __SYCL_DEVICE_ONLY__ -#define OP(type, op) \ - friend type operator op( \ - const wi_element &, \ - const sycl::ext::oneapi::bfloat16 &) { \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_ERROR_INVALID_DEVICE); \ - } \ - friend type operator op( \ - const sycl::ext::oneapi::bfloat16 &, \ - const wi_element &) { \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_ERROR_INVALID_DEVICE); \ - } - OP(sycl::ext::oneapi::bfloat16, +) - OP(sycl::ext::oneapi::bfloat16, -) - OP(sycl::ext::oneapi::bfloat16, *) - OP(sycl::ext::oneapi::bfloat16, /) - OP(bool, ==) - OP(bool, !=) - OP(bool, <) - OP(bool, >) - OP(bool, <=) - OP(bool, >=) -#undef OP -#endif // __SYCL_DEVICE_ONLY__ -}; - -template -class wi_data { - joint_matrix &M; - -public: - wi_data(joint_matrix &Mat) - : M(Mat) {} - size_t length() { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_JointMatrixWorkItemLengthINTEL(M.spvm); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_ERROR_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - wi_element operator[](size_t i) { - return wi_element(M, i); - } -}; - -} // namespace ext::oneapi::experimental::matrix -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp index 5e9faba8e94ec..a871b9709ae66 100644 --- 
a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp @@ -8,6 +8,7 @@ // ===-------------------------------------------------------------------=== // #pragma once +#include "matrix-unified-utils.hpp" #include namespace sycl { @@ -17,10 +18,6 @@ namespace oneapi { namespace experimental { namespace matrix { -enum class use { a, b, accumulator }; - -enum class layout { row_major, col_major, dynamic }; - namespace precision { class tf32 { tf32() = delete; @@ -31,26 +28,6 @@ template struct joint_matrix; -template -class wi_data { - - joint_matrix &jm; - - wi_data(joint_matrix &_jm) : jm(_jm){}; - - template - friend wi_data - get_wi_data(Grp, - joint_matrix &); - -public: - size_t length() { return jm.cuda_impl.wi_marray.size(); }; - - decltype(auto) operator[](size_t i) { return (jm.cuda_impl.wi_marray[i]); }; - -} // namespace matrix } // namespace experimental diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp new file mode 100644 index 0000000000000..718411b22ddbb --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp @@ -0,0 +1,26 @@ +//===---- matrix-unified-utils.hpp - SYCL matrix extension ----*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ===--------------------------------------------------------------------=== // + +#pragma once +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace ext { +namespace oneapi { +namespace experimental { +namespace matrix { + +enum class use { a, b, accumulator }; + +enum class layout { row_major = 0, col_major = 1, dynamic = 3 }; + +} // namespace matrix +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index bbdbdfc2f71b5..f6ad151a7946a 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -7,8 +7,8 @@ // ===--------------------------------------------------------------------=== // #pragma once +#include "matrix-intel.hpp" #include - namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace ext { @@ -20,13 +20,16 @@ template struct joint_matrix { -#if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__) - // TODO: Intel case here: we use the ext_oneapi_cuda case also for the host, - // because the Intel SPIRV functions will not be host compilable.
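// Editor's sketch of the storage strategy the hunk below switches to: instead
// of reusing the CUDA representation everywhere, joint_matrix owns backend-
// specific storage under __SYCL_DEVICE_ONLY__ and stays empty on the host,
// where every entry point throws. The two member types here are placeholders
// standing in for sycl::ext::oneapi::detail::joint_matrix_cuda and the opaque
// __spv::__spirv_JointMatrixINTEL handle; they are not real SYCL names.
template <typename T> struct cuda_fragment {};   // placeholder: per-WI storage
template <typename T> struct spirv_joint_matrix; // placeholder: opaque handle
template <typename T> struct storage_sketch {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
  cuda_fragment<T> cuda_impl;  // CUDA backend: concrete per-work-item storage
#else
  spirv_joint_matrix<T> *spvm; // SPIR-V backends: pointer to an opaque handle
#endif
#endif
  // Host compilation sees an empty struct; host execution throws instead.
};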
-#else +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) sycl::ext::oneapi::detail::joint_matrix_cuda cuda_impl; -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__) +#else + __spv::__spirv_JointMatrixINTEL< + T, Rows, Cols, spv_matrix_layout_traits::value, + spv_scope_traits::value, spv_matrix_use_traits::value> *spvm; +#endif // defined(__SYCL_DEVICE_ONLY__) +#endif joint_matrix() { #ifndef __SYCL_DEVICE_ONLY__ @@ -36,17 +39,74 @@ struct joint_matrix { } }; +#ifdef __SYCL_DEVICE_ONLY__ template -inline __SYCL_ALWAYS_INLINE wi_data +class wi_data { + + joint_matrix &jm; + + wi_data(joint_matrix &_jm) : jm(_jm){}; + + template + friend decltype(auto) + get_wi_data(Grp, + joint_matrix &); + +public: + size_t length() { +#if defined(__NVPTX__) + return jm.cuda_impl.wi_marray.size(); +#else + return __spirv_JointMatrixWorkItemLengthINTEL(jm.spvm); +#endif + }; + + decltype(auto) operator[](size_t i) { +#if defined(__NVPTX__) + return (jm.cuda_impl.wi_marray[i]); +#else + return wi_element(jm, i); +#endif + }; +}; +#else +template class wi_data { + marray &data; + wi_data(marray &wi_marray) : data(wi_marray){}; + template + friend decltype(auto) + get_wi_data(Grp, + joint_matrix &); + +public: + size_t length() { return data.size(); }; + + type &operator[](size_t i) { return data[i]; }; +}; +#endif + +template +inline __SYCL_ALWAYS_INLINE decltype(auto) get_wi_data(Group sg, joint_matrix &jm) { #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) std::ignore = sg; return wi_data(jm); #else - // TODO add Intel impl. + return wi_data(jm); #endif // defined(__NVPTX__) +#else + if constexpr (std::is_same_v) { + marray unused{}; + return wi_data(unused); + } else { + marray unused{}; + return wi_data(unused); + } #endif // defined(__SYCL_DEVICE_ONLY__) } @@ -60,6 +120,12 @@ joint_matrix_fill(Group sg, #if defined(__NVPTX__) std::ignore = sg; res.cuda_impl.wi_marray = v; +#else + res.spvm = + __spirv_CompositeConstruct::value, + spv_matrix_layout_traits::value>( + static_cast(v)); #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -88,6 +154,35 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( std::ignore = sg; sycl::ext::oneapi::detail::load_accumulator_cuda(res.cuda_impl, src, stride, Layout); +#else + // intel's impl + // matL is determined by matrix.use? 
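// Editor's sketch: the accumulator load below receives its layout at run time,
// so the implementation has to switch from the dynamic `layout` value to the
// matching compile-time __spv::MatrixLayout constant. A standalone model of
// that mapping, assuming the numeric values this patch assigns (row_major = 0,
// col_major = 1, the Intel `packed` alias = 2, dynamic = 3):
#include <cstdint>
enum class layout_sketch : uint32_t {
  row_major = 0, col_major = 1, packed = 2, dynamic = 3
};
enum class spv_layout_sketch : uint32_t {
  RowMajor = 0, ColumnMajor = 1, Packed = 2, Dynamic = 3
};
constexpr spv_layout_sketch to_spv(layout_sketch l) {
  switch (l) {
  case layout_sketch::row_major: return spv_layout_sketch::RowMajor;
  case layout_sketch::col_major: return spv_layout_sketch::ColumnMajor;
  case layout_sketch::packed:    return spv_layout_sketch::Packed;
  default:                       return spv_layout_sketch::Dynamic; // invalid at load time
  }
}
static_assert(to_spv(layout_sketch::packed) == spv_layout_sketch::Packed,
              "packed must map to the SPIR-V Packed layout");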
+ T *Ptr = src.get(); + switch (Layout) { + default: + assert(false && "Invalid Memory Layout!"); + case layout::row_major: + res.spvm = __spirv_JointMatrixLoadINTEL< + T, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, stride, __spv::MatrixLayout::RowMajor, + spv_scope_traits::value); + break; + case layout::col_major: + res.spvm = __spirv_JointMatrixLoadINTEL< + T, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, stride, __spv::MatrixLayout::ColumnMajor, + spv_scope_traits::value); + break; + case sycl::ext::intel::experimental::matrix::layout::packed: + res.spvm = __spirv_JointMatrixLoadINTEL< + T, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, stride, __spv::MatrixLayout::Packed, + spv_scope_traits::value); + break; + } #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -119,6 +214,14 @@ joint_matrix_load(Group sg, sycl::ext::oneapi::detail::load_multiplicand_cuda( res.cuda_impl, src, stride); +#else + T *Ptr = src.get(); + res.spvm = + __spirv_JointMatrixLoadINTEL::value, + spv_matrix_layout_traits::value>( + Ptr, stride, spv_matrix_layout_traits::value, + spv_scope_traits::value); #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -146,6 +249,34 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( sycl::ext::oneapi::detail::joint_matrix_store_cuda(src.cuda_impl, dst, stride, Layout); +#else + // intel's impl + T *Ptr = dst.get(); + switch (Layout) { + default: + assert(false && "Invalid Memory Layout!"); + case layout::row_major: + __spirv_JointMatrixStoreINTEL< + T, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, src.spvm, stride, __spv::MatrixLayout::RowMajor, + spv_scope_traits::value); + break; + case layout::col_major: + __spirv_JointMatrixStoreINTEL< + T, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, src.spvm, stride, __spv::MatrixLayout::ColumnMajor, + spv_scope_traits::value); + break; + case sycl::ext::intel::experimental::matrix::layout::packed: + __spirv_JointMatrixStoreINTEL< + T, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, src.spvm, stride, __spv::MatrixLayout::Packed, + spv_scope_traits::value); + break; + } #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -185,6 +316,21 @@ inline __SYCL_ALWAYS_INLINE assert(false && "Ta != Tb : In the CUDA backend joint_matrix_mad " "requires that joint_matrix data types Ta and Tb match"); } +#else + joint_matrix res; + if constexpr (std::is_same::value && + std::is_same::value && + std::is_same::value) + res.spvm = __spirv_JointMatrixMadINTEL(A.spvm, B.spvm, C.spvm); + else if constexpr (std::is_unsigned::value && std::is_unsigned::value) + res.spvm = __spirv_JointMatrixUUMadINTEL(A.spvm, B.spvm, C.spvm); + else if constexpr (std::is_signed::value && std::is_unsigned::value) + res.spvm = __spirv_JointMatrixSUMadINTEL(A.spvm, B.spvm, C.spvm); + else if constexpr (std::is_unsigned::value && std::is_signed::value) + res.spvm = __spirv_JointMatrixUSMadINTEL(A.spvm, B.spvm, C.spvm); + else + res.spvm = __spirv_JointMatrixMadINTEL(A.spvm, B.spvm, C.spvm); + return res; #endif // defined(__NVPTX__) #else std::ignore = sg; diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix.hpp index f3e4ab90dc758..04bf6477f0ddd 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix.hpp +++ 
b/sycl/include/sycl/ext/oneapi/matrix/matrix.hpp @@ -22,13 +22,10 @@ #include #include #endif // SYCL_EXT_ONEAPI_MATRIX_VERSION -#if (SYCL_EXT_ONEAPI_MATRIX_VERSION == 2) -#include -#include -#endif // SYCL_EXT_ONEAPI_MATRIX_VERSION #if (SYCL_EXT_ONEAPI_MATRIX_VERSION == 3) #include #endif // SYCL_EXT_ONEAPI_MATRIX_VERSION #if (SYCL_EXT_ONEAPI_MATRIX_VERSION == 4) #include +#include #endif // SYCL_EXT_ONEAPI_MATRIX_VERSION diff --git a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp index 81dae79927fd6..3e561c9067061 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp @@ -147,13 +147,13 @@ struct tpu_params - using joint_matrix_a = joint_matrix; - template - using joint_matrix_b = joint_matrix; + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; template using joint_matrix_accumulator = - joint_matrix; + joint_matrix; bool dynamic_p = false; // should be true in future implementations because // AMX hardware supports dynamic sizes @@ -199,13 +199,13 @@ struct tpu_params< static constexpr std::size_t K = (sK != 0) ? sK : ((sizeof(Ta) == 1) ? 64 : 32); - template - using joint_matrix_a = joint_matrix; - template - using joint_matrix_b = joint_matrix; + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; template using joint_matrix_accumulator = - joint_matrix; + joint_matrix; bool dynamic_p = false; // should be true in future implementations // because AMX hardware supports dynamic sizes @@ -335,13 +335,13 @@ struct tpu_params - using joint_matrix_a = joint_matrix; - template - using joint_matrix_b = joint_matrix; + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; template using joint_matrix_accumulator = - joint_matrix; + joint_matrix; bool dynamic_p = false; // no dynamic allocation on the GPU uint32_t numtiles = -1; // does not apply for DPAS @@ -391,13 +391,13 @@ struct tpu_params< static constexpr std::size_t K = (sK != 0) ? sK : ((sizeof(Ta) == 1) ? 
32 : 16); - template - using joint_matrix_a = joint_matrix; - template - using joint_matrix_b = joint_matrix; + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; template using joint_matrix_accumulator = - joint_matrix; + joint_matrix; bool dynamic_p = false; // no dynamic allocation on the GPU uint32_t numtiles = -1; // does not apply for DPAS diff --git a/sycl/test/matrix/matrix-bf16-test-SG-16.cpp b/sycl/test/matrix/legacy/matrix-bf16-test-SG-16.cpp similarity index 76% rename from sycl/test/matrix/matrix-bf16-test-SG-16.cpp rename to sycl/test/matrix/legacy/matrix-bf16-test-SG-16.cpp index aa53fce4c72b9..541f4f75a4c71 100644 --- a/sycl/test/matrix/matrix-bf16-test-SG-16.cpp +++ b/sycl/test/matrix/legacy/matrix-bf16-test-SG-16.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 %s -o %t.out #include #include @@ -6,26 +6,28 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; #define TILE_SZ 16 -#define TM (TILE_SZ-1) -#define TN (TILE_SZ-1) -#define TK (2*TILE_SZ-2) +#define TM (TILE_SZ - 1) +#define TN (TILE_SZ - 1) +#define TK (2 * TILE_SZ - 2) #define SG_SZ 16 -template struct big_matrix{ +template struct big_matrix { public: T *mat; public: T *get_data() { return mat; } void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) { - } + big_matrix(T *data) : mat(data) {} }; -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { size_t M = NUM_ROWS_C; size_t N = NUM_COLS_C; size_t K = NUM_COLS_A; @@ -36,7 +38,7 @@ void matrix_multiply(big_matrix &C, big_matrix bufA(A.get_data(), range<2>(M, K)); buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float*)C.get_data(), range<2>(M, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; q.submit([&](handler &cgh) { @@ -46,7 +48,8 @@ void matrix_multiply(big_matrix &C, big_matrix( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + [[intel::reqd_sub_group_size(SG_SZ)]] { // The submatrix API has to be accessed by all the workitems in a @@ -59,12 +62,14 @@ void matrix_multiply(big_matrix &C, big_matrix sub_a(sg); - // For B, since current implementation does not support non-packed layout, - // users need to specify the updated VNNI sizes along with the packed_b layout. - // By default, the layout is row_major and size is (TK, TN). - joint_matrix sub_b(sg); + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). 
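// Editor's sketch (hypothetical helper, not part of this test): the VNNI format
// the comment above refers to interleaves each pair of consecutive rows for
// 16-bit elements, so element (k, n) of the row-major K x N source lands at row
// k/2, column 2*n + k%2 of the (K/2) x (2*N) buffer the kernel indexes below.
#include <cstddef>
void vnni_pack_u16(const unsigned short *src, unsigned short *dst,
                   std::size_t K, std::size_t N) {
  for (std::size_t k = 0; k < K; ++k)
    for (std::size_t n = 0; n < N; ++n)
      dst[(k / 2) * (N * 2) + n * 2 + (k % 2)] = src[k * N + n];
}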
+ joint_matrix sub_b( + sg); joint_matrix sub_c(sg); - + // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 joint_matrix_load(sg, sub_c, @@ -98,34 +103,33 @@ unsigned short B[MATRIX_K / 2][MATRIX_N * 2]; float C[MATRIX_M][MATRIX_N]; float D[MATRIX_M][MATRIX_N]; -float make_fp32(short x) -{ +float make_fp32(short x) { unsigned int y = x; y = y << 16; - float *res = reinterpret_cast(&y); + float *res = reinterpret_cast(&y); return *res; } -unsigned short make_bf16(float x) -{ - int *res = reinterpret_cast(&x); +unsigned short make_bf16(float x) { + int *res = reinterpret_cast(&x); *res = *res >> 16; return (unsigned short)*res; } -void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { +void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, + int K) { // tiling for (int m = 0; m < M; m++) for (int n = 0; n < N; n++) { for (int k = 0; k < K; k++) { - short *va = (short *)(A_mem + m*K + k); - short *vb = (short *)(B_mem + k*N + n); - float acc = *((float*)(C_mem + m*N + n)); + short *va = (short *)(A_mem + m * K + k); + short *vb = (short *)(B_mem + k * N + n); + float acc = *((float *)(C_mem + m * N + n)); // FIXME: Should we do reduce-add in another version? for (int i = 0; i < 2; i++) { acc += (make_fp32(va[i]) * make_fp32(vb[i])); } - *((float*)(C_mem + m*N + n))= acc; + *((float *)(C_mem + m * N + n)) = acc; } } } @@ -133,12 +137,12 @@ void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K int main() { for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = make_bf16(1.0f * (i+j)); + A[i][j] = make_bf16(1.0f * (i + j)); } } for (int i = 0; i < MATRIX_K / 2; i++) { for (int j = 0; j < MATRIX_N * 2; j++) { - B[i][j] = make_bf16(2.0f*i + 3.0f*j); + B[i][j] = make_bf16(2.0f * i + 3.0f * j); } } for (int i = 0; i < MATRIX_M; i++) { @@ -151,10 +155,11 @@ int main() { big_matrix MC((float *)&C); big_matrix MD((float *)&D); big_matrix MA((unsigned short *)&A); - big_matrix MB((unsigned short *)&B); + big_matrix MB( + (unsigned short *)&B); matrix_multiply(MC, MA, MB); matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 2); + MATRIX_N, MATRIX_K / 2); bool res = true; for (int i = 0; i < MATRIX_M; i++) { diff --git a/sycl/test/matrix/matrix-bf16-test.cpp b/sycl/test/matrix/legacy/matrix-bf16-test.cpp similarity index 78% rename from sycl/test/matrix/matrix-bf16-test.cpp rename to sycl/test/matrix/legacy/matrix-bf16-test.cpp index 6169b4f9ca324..447e880afecbd 100644 --- a/sycl/test/matrix/matrix-bf16-test.cpp +++ b/sycl/test/matrix/legacy/matrix-bf16-test.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 %s -o %t.out #include #include @@ -6,26 +6,28 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; #define TILE_SZ 16 -#define TM (TILE_SZ-1) -#define TN (TILE_SZ-1) -#define TK (2 * TILE_SZ-2) +#define TM (TILE_SZ - 1) +#define TN (TILE_SZ - 1) +#define TK (2 * TILE_SZ - 2) #define SG_SZ 16 -template struct big_matrix{ +template struct big_matrix { public: T *mat; public: T *get_data() { return mat; } void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) { - } + big_matrix(T *data) : mat(data) {} }; -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { 
size_t M = NUM_ROWS_C; size_t N = NUM_COLS_C; size_t K = NUM_COLS_A; @@ -36,7 +38,7 @@ void matrix_multiply(big_matrix &C, big_matrix bufA(A.get_data(), range<2>(M, K)); buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float*)C.get_data(), range<2>(M, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; q.submit([&](handler &cgh) { @@ -59,10 +61,12 @@ void matrix_multiply(big_matrix &C, big_matrix sub_a(sg); - // For B, since current implementation does not support non-packed layout, - // users need to specify the updated VNNI sizes along with the packed_b layout. - // By default, the layout is row_major and size is (TK, TN). - joint_matrix sub_b(sg); + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). + joint_matrix sub_b( + sg); joint_matrix sub_c(sg); // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 @@ -98,34 +102,33 @@ unsigned short B[MATRIX_K / 2][MATRIX_N * 2]; float C[MATRIX_M][MATRIX_N]; float D[MATRIX_M][MATRIX_N]; -float make_fp32(short x) -{ +float make_fp32(short x) { unsigned int y = x; y = y << 16; - float *res = reinterpret_cast(&y); + float *res = reinterpret_cast(&y); return *res; } -unsigned short make_bf16(float x) -{ - int *res = reinterpret_cast(&x); +unsigned short make_bf16(float x) { + int *res = reinterpret_cast(&x); *res = *res >> 16; return (unsigned short)*res; } -void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { +void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, + int K) { // tiling for (int m = 0; m < M; m++) for (int n = 0; n < N; n++) { for (int k = 0; k < K; k++) { - short *va = (short *)(A_mem + m*K + k); - short *vb = (short *)(B_mem + k*N + n); - float acc = *((float*)(C_mem + m*N + n)); + short *va = (short *)(A_mem + m * K + k); + short *vb = (short *)(B_mem + k * N + n); + float acc = *((float *)(C_mem + m * N + n)); // FIXME: Should we do reduce-add in another version? 
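// Editor's sketch: the make_fp32/make_bf16 helpers above treat bf16 as the top
// 16 bits of an IEEE-754 float, truncating without rounding. The same round
// trip written with memcpy (avoiding the pointer-cast aliasing in the test's
// helpers); e.g. to_bf16(1.0f) == 0x3F80 and from_bf16(0x3F80) == 1.0f, while
// floats differing only in the low 16 mantissa bits collapse to the same bf16.
#include <cstdint>
#include <cstring>
static uint16_t to_bf16(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16); // truncate, no rounding
}
static float from_bf16(uint16_t x) {
  uint32_t bits = static_cast<uint32_t>(x) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}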
for (int i = 0; i < 2; i++) { acc += (make_fp32(va[i]) * make_fp32(vb[i])); } - *((float*)(C_mem + m*N + n))= acc; + *((float *)(C_mem + m * N + n)) = acc; } } } @@ -133,12 +136,12 @@ void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K int main() { for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = make_bf16(1.0f * (i+j)); + A[i][j] = make_bf16(1.0f * (i + j)); } } for (int i = 0; i < MATRIX_K / 2; i++) { for (int j = 0; j < MATRIX_N * 2; j++) { - B[i][j] = make_bf16(2.0f*i + 3.0f*j); + B[i][j] = make_bf16(2.0f * i + 3.0f * j); } } for (int i = 0; i < MATRIX_M; i++) { @@ -151,10 +154,11 @@ int main() { big_matrix MC((float *)&C); big_matrix MD((float *)&D); big_matrix MA((unsigned short *)&A); - big_matrix MB((unsigned short *)&B); + big_matrix MB( + (unsigned short *)&B); matrix_multiply(MC, MA, MB); matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 2); + MATRIX_N, MATRIX_K / 2); bool res = true; for (int i = 0; i < MATRIX_M; i++) { diff --git a/sycl/test/matrix/matrix-bfloat16-test-use.cpp b/sycl/test/matrix/legacy/matrix-bfloat16-test.cpp similarity index 91% rename from sycl/test/matrix/matrix-bfloat16-test-use.cpp rename to sycl/test/matrix/legacy/matrix-bfloat16-test.cpp index f133b5d5bd9cc..b87b63e29cc98 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-use.cpp +++ b/sycl/test/matrix/legacy/matrix-bfloat16-test.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 %s -o %t.out #include #include @@ -60,33 +60,35 @@ void matrix_multiply(big_matrix &C, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); + joint_matrix sub_a(sg); // For B, since current implementation does not support non-packed // layout, users need to specify the updated VNNI sizes along with // the packed_b layout. By default, the layout is row_major and size // is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 + // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 joint_matrix_load(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); + N, matrix_layout::row_major); for (int k = 0; k < K / TK; k += 1) { // joint_matrix_load( sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, layout::row_major); + K, matrix_layout::row_major); // Assuming B data is already in VNNI format. 
joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, - N * 2, layout::packed_b); + N * 2, matrix_layout::packed_b); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); + N, matrix_layout::row_major); }); // parallel for }).wait(); } diff --git a/sycl/test/matrix/legacy/matrix-elemwise-ops.cpp b/sycl/test/matrix/legacy/matrix-elemwise-ops.cpp new file mode 100644 index 0000000000000..01efab63246d0 --- /dev/null +++ b/sycl/test/matrix/legacy/matrix-elemwise-ops.cpp @@ -0,0 +1,175 @@ +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 %s -o %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define TILE_SZ 16 +#define TM (TILE_SZ - 4) +#define TN (TILE_SZ - 4) +#define TK (4 * TILE_SZ - 16) + +#define SG_SZ 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stride = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 + // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K, matrix_layout::row_major); + // Assuming B data is already in VNNI format.
+ joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4, matrix_layout::packed_b); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + auto wi_data_c = sub_c.get_wi_data(); + for (int i = 0; i < wi_data_c.length(); i++) { + wi_data_c[i] *= 2; + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +int8_t A[MATRIX_M][MATRIX_K]; +int8_t B[MATRIX_K / 4][MATRIX_N * 4]; +int32_t C[MATRIX_M][MATRIX_N]; +int32_t D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, + int N, int K) { + // tiling + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + char *va = (char *)(A_mem + m * K + k); + char *vb = (char *)(B_mem + k * N + n); + int acc = *(C_mem + m * N + n); + for (int i = 0; i < 4; i++) { + acc += (va[i] * vb[i]); + } + *(C_mem + m * N + n) = acc; + } + *(C_mem + m * N + n) *= 2; + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i + 2 * j; + } + } + for (int i = 0; i < MATRIX_K / 4; i++) { + for (int j = 0; j < MATRIX_N * 4; j++) { + B[i][j] = i + j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + big_matrix MB((int8_t *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 4); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << C[i][j] << ", "; + std::cout << "\n"; + } + std::cout << std::endl; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << D[i][j] << ", "; + std::cout << "\n"; + } +} diff --git a/sycl/test/matrix/matrix-int8-test-SG-16.cpp b/sycl/test/matrix/legacy/matrix-int8-test-SG-16.cpp similarity index 83% rename from sycl/test/matrix/matrix-int8-test-SG-16.cpp rename to sycl/test/matrix/legacy/matrix-int8-test-SG-16.cpp index 35834273720ff..ed6408f74076a 100644 --- a/sycl/test/matrix/matrix-int8-test-SG-16.cpp +++ b/sycl/test/matrix/legacy/matrix-int8-test-SG-16.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 %s -o %t.out #include #include @@ -6,26 +6,28 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; #define TILE_SZ 16 -#define TM (TILE_SZ-5) -#define TN (TILE_SZ-6) -#define TK (4 * TILE_SZ-8) +#define TM (TILE_SZ - 5) +#define TN (TILE_SZ - 6) +#define TK (4 * TILE_SZ - 8) #define SG_SZ 16 -template struct big_matrix{ +template struct big_matrix { public: T *mat; public: T *get_data() { return mat; } void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) { - } + big_matrix(T *data) : mat(data) {} }; -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { +template +void matrix_multiply(big_matrix &C, + big_matrix 
&A, + big_matrix &B) { size_t M = NUM_ROWS_C; size_t N = NUM_COLS_C; size_t K = NUM_COLS_A; @@ -46,7 +48,8 @@ void matrix_multiply(big_matrix &C, big_matrix( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + [[intel::reqd_sub_group_size(SG_SZ)]] { // The submatrix API has to be accessed by all the workitems in a @@ -59,9 +62,10 @@ void matrix_multiply(big_matrix &C, big_matrix sub_a(sg); - // For B, since current implementation does not support non-packed layout, - // users need to specify the updated VNNI sizes along with the packed_b layout. - // By default, the layout is row_major and size is (TK, TN). + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). joint_matrix sub_b(sg); joint_matrix sub_c(sg); @@ -99,7 +103,7 @@ int32_t C[MATRIX_M][MATRIX_N]; int32_t D[MATRIX_M][MATRIX_N]; void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, - int N, int K) { + int N, int K) { // tiling for (int m = 0; m < M; m++) for (int n = 0; n < N; n++) { @@ -118,12 +122,12 @@ void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, int main() { for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = i+2*j; + A[i][j] = i + 2 * j; } } for (int i = 0; i < MATRIX_K / 4; i++) { for (int j = 0; j < MATRIX_N * 4; j++) { - B[i][j] = i+j; + B[i][j] = i + j; } } for (int i = 0; i < MATRIX_M; i++) { @@ -136,10 +140,10 @@ int main() { big_matrix MC((int32_t *)&C); big_matrix MD((int32_t *)&D); big_matrix MA((int8_t *)&A); - big_matrix MB((int8_t *)&B); + big_matrix MB((int8_t *)&B); matrix_multiply(MC, MA, MB); matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 4); + MATRIX_N, MATRIX_K / 4); bool res = true; for (int i = 0; i < MATRIX_M; i++) { diff --git a/sycl/test/matrix/matrix-int8-test-use.cpp b/sycl/test/matrix/legacy/matrix-int8-test.cpp similarity index 88% rename from sycl/test/matrix/matrix-int8-test-use.cpp rename to sycl/test/matrix/legacy/matrix-int8-test.cpp index ff730dfa78055..a0c2edb62c2f1 100644 --- a/sycl/test/matrix/matrix-int8-test-use.cpp +++ b/sycl/test/matrix/legacy/matrix-int8-test.cpp @@ -1,8 +1,8 @@ -// RUN: %clangxx -fsycl -fsycl-device-only -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 -O2 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 -S -emit-llvm -o - %s | FileCheck %s -// CHECK-DAG: %spirv.JointMatrixINTEL._char_12_48_4_3_0 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._int_12_12_4_3_2 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._char_48_12_4_3_1 = type opaque +// CHECK-DAG: %spirv.JointMatrixINTEL._char_12_48_0_3 = type opaque +// CHECK-DAG: %spirv.JointMatrixINTEL._int_12_12_0_3 = type opaque +// CHECK-DAG: %spirv.JointMatrixINTEL._char_48_12_3_3 = type opaque #include #include @@ -54,7 +54,6 @@ void matrix_multiply(big_matrix &C, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - [[intel::reqd_sub_group_size(SG_SZ)]] { // The submatrix API has to be accessed by all the workitems in a @@ -66,13 +65,13 @@ void matrix_multiply(big_matrix &C, const auto sg_starty = global_idy - 
spmd_item.get_local_id(1); ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); + joint_matrix sub_a(sg); // For B, since current implementation does not support non-packed // layout, users need to specify the updated VNNI sizes along with // the packed_b layout. By default, the layout is row_major and size // is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 @@ -80,18 +79,18 @@ void matrix_multiply(big_matrix &C, for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, layout::row_major); + K, matrix_layout::row_major); // Assuming B data is already in VNNI format. joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, - N * 4, layout::packed_b); + N * 4, matrix_layout::packed_b); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); + N, matrix_layout::row_major); }); // parallel for }).wait(); } diff --git a/sycl/test/matrix/matrix-bfloat16-test.cpp b/sycl/test/matrix/matrix-bfloat16-test.cpp index f4a7262b9fd89..2f8fcd4efc6fc 100644 --- a/sycl/test/matrix/matrix-bfloat16-test.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out #include #include @@ -59,36 +59,38 @@ void matrix_multiply(big_matrix &C, const auto sg_startx = global_idx - spmd_item.get_local_id(0); const auto sg_starty = global_idy - spmd_item.get_local_id(1); - sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); + sycl::sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; // For B, since current implementation does not support non-packed // layout, users need to specify the updated VNNI sizes along with // the packed_b layout. By default, the layout is row_major and size // is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + joint_matrix + sub_b; + joint_matrix sub_c; - // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 - // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 joint_matrix_load(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); + N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { // joint_matrix_load( sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, matrix_layout::row_major); + K); // Assuming B data is already in VNNI format. 
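           // VNNI packing: four 8-bit values along the K dimension are
           // stored in one 32-bit element, which is why B is laid out as
           // [K/4][N*4], the row offset is (k * TK / 4) * (N * 4), and
           // strideB is N * 4.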
joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, - N * 2, matrix_layout::packed_b); + N * 2); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); + N, layout::row_major); }); // parallel for }).wait(); } diff --git a/sycl/test/matrix/matrix-elemwise-ops.cpp b/sycl/test/matrix/matrix-elemwise-ops.cpp index 842678a9224a5..5da46c6eb7d49 100644 --- a/sycl/test/matrix/matrix-elemwise-ops.cpp +++ b/sycl/test/matrix/matrix-elemwise-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -O2 %s -o %t.out #include #include @@ -60,40 +60,45 @@ void matrix_multiply(big_matrix &C, const auto sg_startx = global_idx - spmd_item.get_local_id(0); const auto sg_starty = global_idy - spmd_item.get_local_id(1); - ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); + sycl::sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; // For B, since current implementation does not support non-packed // layout, users need to specify the updated VNNI sizes along with // the packed_b layout. By default, the layout is row_major and size // is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + joint_matrix + sub_b; + joint_matrix + sub_c; // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 joint_matrix_load(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); + N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, matrix_layout::row_major); + K); // Assuming B data is already in VNNI format. 
joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, - N * 4, matrix_layout::packed_b); + N * 4); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } - auto wi_data_c = sub_c.get_wi_data(); + auto wi_data_c = get_wi_data(sg, sub_c); for (int i = 0; i < wi_data_c.length(); i++) { wi_data_c[i] *= 2; } joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); + N, layout::row_major); }); // parallel for }).wait(); } diff --git a/sycl/test/matrix/matrix-int8-test.cpp b/sycl/test/matrix/matrix-int8-test.cpp index e49dfe0c358c3..de8721bca3b09 100644 --- a/sycl/test/matrix/matrix-int8-test.cpp +++ b/sycl/test/matrix/matrix-int8-test.cpp @@ -1,8 +1,8 @@ -// RUN: %clangxx -fsycl -fsycl-device-only -O2 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -O2 -S -emit-llvm -o - %s | FileCheck %s -// CHECK-DAG: %spirv.JointMatrixINTEL._char_12_48_0_3 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._int_12_12_0_3 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._char_48_12_3_3 = type opaque +// CHECK-DAG: %spirv.JointMatrixINTEL._char_12_48_0_3_0 = type opaque +// CHECK-DAG: %spirv.JointMatrixINTEL._int_12_12_3_3_2 = type opaque +// CHECK-DAG: %spirv.JointMatrixINTEL._char_48_12_2_3_1 = type opaque #include #include @@ -11,26 +11,28 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; #define TILE_SZ 16 -#define TM (TILE_SZ-4) -#define TN (TILE_SZ-4) -#define TK (4 * TILE_SZ-16) +#define TM (TILE_SZ - 4) +#define TN (TILE_SZ - 4) +#define TK (4 * TILE_SZ - 16) #define SG_SZ 16 -template struct big_matrix{ +template struct big_matrix { public: T *mat; public: T *get_data() { return mat; } void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) { - } + big_matrix(T *data) : mat(data) {} }; -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { size_t M = NUM_ROWS_C; size_t N = NUM_COLS_C; size_t K = NUM_COLS_A; @@ -52,6 +54,7 @@ void matrix_multiply(big_matrix &C, big_matrix( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + [[intel::reqd_sub_group_size(SG_SZ)]] { // The submatrix API has to be accessed by all the workitems in a @@ -62,13 +65,19 @@ void matrix_multiply(big_matrix &C, big_matrix sub_a(sg); - // For B, since current implementation does not support non-packed layout, - // users need to specify the updated VNNI sizes along with the packed_b layout. - // By default, the layout is row_major and size is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + sycl::sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). 
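+          // Compared with the VERSION=1 API above: matrices are
+          // default-constructed instead of taking the sub-group, encode
+          // their use and layout in the joint_matrix type, and loads of A
+          // and B no longer pass an explicit matrix_layout argument.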
+ joint_matrix + sub_b; + joint_matrix + sub_c; // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 @@ -76,18 +85,18 @@ void matrix_multiply(big_matrix &C, big_matrix MC((int32_t *)&C); big_matrix MD((int32_t *)&D); big_matrix MA((int8_t *)&A); - big_matrix MB((int8_t *)&B); + big_matrix MB((int8_t *)&B); matrix_multiply(MC, MA, MB); matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 4); + MATRIX_N, MATRIX_K / 4); bool res = true; for (int i = 0; i < MATRIX_M; i++) { diff --git a/sycl/test/matrix/query-use.cpp b/sycl/test/matrix/query-use.cpp index be6a6a3280996..6239d05d5f79f 100644 --- a/sycl/test/matrix/query-use.cpp +++ b/sycl/test/matrix/query-use.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 -fsycl -o query-use %s +// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o query-use %s #include #include @@ -55,13 +55,15 @@ void query_amx() { nd_range<2>({NDRangeM, NDRangeN}, {1, 1}), [msize, ksize, nsize](nd_item<2> spmd_item) { sub_group sg = spmd_item.get_sub_group(); - myparams2::joint_matrix_a sub_a1(sg); - myparams2::joint_matrix_b sub_b1(sg); - myparams2::joint_matrix_accumulator sub_c1(sg); - - joint_matrix sub_a(sg); - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + myparams2::joint_matrix_a sub_a1; + myparams2::joint_matrix_b< + sub_group, sycl::ext::intel::experimental::matrix::layout::packed> + sub_b1; + myparams2::joint_matrix_accumulator sub_c1; + + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; }); }); } @@ -125,13 +127,15 @@ void query_dpas() { nd_range<2>({NDRangeM, NDRangeN}, {1, 1}), [msize, ksize, nsize](nd_item<2> spmd_item) { sub_group sg = spmd_item.get_sub_group(); - myparams2::joint_matrix_a sub_a1(sg); - myparams2::joint_matrix_b sub_b1(sg); - myparams2::joint_matrix_accumulator sub_c1(sg); - - joint_matrix sub_a(sg); - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + myparams2::joint_matrix_a sub_a1; + myparams2::joint_matrix_b< + sub_group, sycl::ext::intel::experimental::matrix::layout::packed> + sub_b1; + myparams2::joint_matrix_accumulator sub_c1; + + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; }); }); } From 22610c6bc85697c477a6b927cf50461de7ac313e Mon Sep 17 00:00:00 2001 From: smaslov-intel Date: Fri, 16 Dec 2022 08:39:24 -0800 Subject: [PATCH 05/11] [SYCL] Fuse zeCommandListAppendWaitOnEvents into subsequent memory copy command (#7807) Signed-off-by: Sergey V Maslov --- sycl/plugins/level_zero/pi_level_zero.cpp | 35 ++++++----------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 6c93a6e82c0aa..8e0806c70f6b5 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -6929,11 +6929,6 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, const auto &ZeCommandList = CommandList->first; const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { - - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } zePrint("calling zeCommandListAppendMemoryCopy() with\n" " ZeEvent %#lx\n", @@ -6941,7 +6936,8 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, printZeEventList(WaitList); ZE_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, Dst, Src, Size, ZeEvent, 0, nullptr)); + (ZeCommandList, Dst, Src, 
Size, ZeEvent, WaitList.Length, + WaitList.ZeEventList)); if (auto Res = Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch)) @@ -6993,10 +6989,6 @@ static pi_result enqueueMemCopyRectHelper( const auto &ZeCommandList = CommandList->first; const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } zePrint("calling zeCommandListAppendMemoryCopy() with\n" " ZeEvent %#lx\n", pi_cast(ZeEvent)); @@ -7035,8 +7027,8 @@ static pi_result enqueueMemCopyRectHelper( ZE_CALL(zeCommandListAppendMemoryCopyRegion, (ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch, - SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, nullptr, 0, - nullptr)); + SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, nullptr, + WaitList.Length, WaitList.ZeEventList)); zePrint("calling zeCommandListAppendMemoryCopyRegion()\n"); @@ -7250,14 +7242,9 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, const auto &ZeCommandList = CommandList->first; const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } - - ZE_CALL( - zeCommandListAppendMemoryFill, - (ZeCommandList, Ptr, Pattern, PatternSize, Size, ZeEvent, 0, nullptr)); + ZE_CALL(zeCommandListAppendMemoryFill, + (ZeCommandList, Ptr, Pattern, PatternSize, Size, ZeEvent, + WaitList.Length, WaitList.ZeEventList)); zePrint("calling zeCommandListAppendMemoryFill() with\n" " ZeEvent %#lx\n", @@ -7669,10 +7656,6 @@ static pi_result enqueueMemImageCommandHelper( const auto &ZeCommandList = CommandList->first; const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } if (CommandType == PI_COMMAND_TYPE_IMAGE_READ) { pi_mem SrcMem = pi_cast(const_cast(Src)); @@ -7709,7 +7692,7 @@ static pi_result enqueueMemImageCommandHelper( SrcMem->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); ZE_CALL(zeCommandListAppendImageCopyToMemory, (ZeCommandList, Dst, pi_cast(ZeHandleSrc), - &ZeSrcRegion, ZeEvent, 0, nullptr)); + &ZeSrcRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); } else if (CommandType == PI_COMMAND_TYPE_IMAGE_WRITE) { pi_mem DstMem = pi_cast(Dst); ze_image_region_t ZeDstRegion; @@ -7743,7 +7726,7 @@ static pi_result enqueueMemImageCommandHelper( DstMem->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); ZE_CALL(zeCommandListAppendImageCopyFromMemory, (ZeCommandList, pi_cast(ZeHandleDst), Src, - &ZeDstRegion, ZeEvent, 0, nullptr)); + &ZeDstRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); } else if (CommandType == PI_COMMAND_TYPE_IMAGE_COPY) { pi_mem SrcImage = pi_cast(const_cast(Src)); pi_mem DstImage = pi_cast(Dst); From ea44995a89a7b06aacf6653c7ff87d9de1228029 Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Fri, 16 Dec 2022 18:28:28 +0100 Subject: [PATCH 06/11] [SYCL] Fix scheduler unit test: EnqueueNoMemObjDoubleKernelDepHostBlocked (#7793) Signed-off-by: Tikhomirova, Kseniya --- sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index af94d48b8c514..97aefc8db7b36 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ 
b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp
@@ -227,7 +227,8 @@ TEST_F(DependsOnTests, EnqueueNoMemObjDoubleKernelDepHostBlocked) {
   // kernels on host task completion
   std::vector<EventImplPtr> Events;
 
-  detail::Command *Cmd1 = AddTaskCG(TestCGType::HOST_TASK, Events);
+  detail::Command *Cmd1 =
+      AddTaskCG(TestCGType::HOST_TASK, Events, &CustomHostLambda);
   EventImplPtr Cmd1Event = Cmd1->getEvent();
   Cmd1->MIsBlockable = true;
   Cmd1->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueBlocked;
@@ -269,9 +270,9 @@ TEST_F(DependsOnTests, EnqueueNoMemObjDoubleKernelDepHost) {
   // kernels on host task completion
   std::vector<EventImplPtr> Events;
 
-  detail::Command *Cmd1 = AddTaskCG(TestCGType::HOST_TASK, Events);
+  detail::Command *Cmd1 =
+      AddTaskCG(TestCGType::HOST_TASK, Events, &CustomHostLambda);
   EventImplPtr Cmd1Event = Cmd1->getEvent();
-  Cmd1->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueBlocked;
 
   // Depends on host task
   Events.push_back(Cmd1Event);

From 89e82e3f7421999938e5bb5f685b9299dbe709ff Mon Sep 17 00:00:00 2001
From: Lukas Sommer
Date: Fri, 16 Dec 2022 18:59:20 +0100
Subject: [PATCH 07/11] [SYCL][Fusion] Scheduler support for kernel fusion
 (#7531)

This is the third patch in a series of patches adding an implementation
of the [kernel fusion extension](https://github.com/intel/llvm/pull/7098).
We have split the implementation into multiple patches to make them
easier to review.

This patch integrates the kernel fusion extension into the SYCL runtime
scheduler. Besides collecting the kernels submitted while in fusion mode
in the fusion list associated with the queue, the scheduler integration
is also responsible for detecting synchronization scenarios. Various
scenarios, such as buffer destruction or event wait, require fusion to
be aborted early. The full list of scenarios is available in the
[extension proposal](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc#synchronization-in-the-sycl-application).

A high-level description of the integration into the scheduler can be
found in the [design document](https://github.com/intel/llvm/pull/7204).

This PR can be reviewed and merged independently of
https://github.com/intel/llvm/pull/7465.
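For context, the synchronization scenarios mentioned above are driven by
the user-facing API of the extension. The sketch below shows how an
application would use it; the fusion_wrapper class and the enable_fusion
queue property are taken from the extension proposal and are assumptions
here, since this patch only adds the scheduler plumbing:

#include <sycl/sycl.hpp>

namespace exp = sycl::ext::codeplay::experimental;

int main() {
  constexpr size_t N = 1024;
  sycl::queue q{exp::property::queue::enable_fusion{}};
  exp::fusion_wrapper fw{q};
  sycl::buffer<int> a{sycl::range<1>{N}}, b{sycl::range<1>{N}};

  fw.start_fusion(); // Queue enters fusion mode; kernels are only collected.
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc{a, cgh, sycl::write_only};
    cgh.parallel_for(sycl::range<1>{N},
                     [=](sycl::id<1> i) { acc[i] = static_cast<int>(i[0]); });
  });
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor in{a, cgh, sycl::read_only};
    sycl::accessor out{b, cgh, sycl::write_only};
    cgh.parallel_for(sycl::range<1>{N},
                     [=](sycl::id<1> i) { out[i] = 2 * in[i]; });
  });
  // Waiting on one of the submit events or creating a host accessor for
  // a or b at this point is exactly one of the synchronization scenarios
  // that abort fusion early.
  fw.complete_fusion().wait(); // Hand the fusion list to the JIT compiler.
  return 0;
}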
Signed-off-by: Lukas Sommer Signed-off-by: Lukas Sommer --- .../detail/fusion/fusion_wrapper_impl.cpp | 18 +- sycl/source/detail/helpers.cpp | 12 +- sycl/source/detail/queue_impl.hpp | 9 + sycl/source/detail/scheduler/commands.cpp | 123 +++++++ sycl/source/detail/scheduler/commands.hpp | 53 ++- .../source/detail/scheduler/graph_builder.cpp | 331 +++++++++++++++++- .../detail/scheduler/graph_processor.cpp | 35 +- sycl/source/detail/scheduler/scheduler.cpp | 148 ++++++-- sycl/source/detail/scheduler/scheduler.hpp | 78 ++++- sycl/source/handler.cpp | 9 +- sycl/unittests/scheduler/CMakeLists.txt | 1 + sycl/unittests/scheduler/KernelFusion.cpp | 150 ++++++++ .../scheduler/SchedulerTestUtils.hpp | 16 +- 13 files changed, 901 insertions(+), 82 deletions(-) create mode 100644 sycl/unittests/scheduler/KernelFusion.cpp diff --git a/sycl/source/detail/fusion/fusion_wrapper_impl.cpp b/sycl/source/detail/fusion/fusion_wrapper_impl.cpp index f959dcc8d51dc..92130f76fd2ad 100644 --- a/sycl/source/detail/fusion/fusion_wrapper_impl.cpp +++ b/sycl/source/detail/fusion/fusion_wrapper_impl.cpp @@ -8,6 +8,8 @@ #include +#include + namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { @@ -20,22 +22,22 @@ std::shared_ptr fusion_wrapper_impl::get_queue() const { return MQueue; } -bool fusion_wrapper_impl::is_in_fusion_mode() const { return false; } +bool fusion_wrapper_impl::is_in_fusion_mode() const { + return MQueue->is_in_fusion_mode(); +} void fusion_wrapper_impl::start_fusion() { - throw sycl::exception(sycl::errc::feature_not_supported, - "Fusion not yet implemented"); + detail::Scheduler::getInstance().startFusion(MQueue); } void fusion_wrapper_impl::cancel_fusion() { - throw sycl::exception(sycl::errc::feature_not_supported, - "Fusion not yet implemented"); + detail::Scheduler::getInstance().cancelFusion(MQueue); } event fusion_wrapper_impl::complete_fusion(const property_list &PropList) { - (void)PropList; - throw sycl::exception(sycl::errc::feature_not_supported, - "Fusion not yet implemented"); + auto EventImpl = + detail::Scheduler::getInstance().completeFusion(MQueue, PropList); + return detail::createSyclObjFromImpl(EventImpl); } } // namespace detail diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index c605b845b0147..6c37f0bb9093d 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include @@ -30,9 +31,16 @@ std::vector getOrWaitEvents(std::vector DepEvents, !SyclEventImplPtr->is_host()) { continue; } + // The fusion command and its event are associated with a non-host context, + // but still does not produce a PI event. + bool NoPiEvent = + SyclEventImplPtr->MCommand && + !static_cast(SyclEventImplPtr->MCommand)->producesPiEvent(); if (SyclEventImplPtr->is_host() || - SyclEventImplPtr->getContextImpl() != Context) { - SyclEventImplPtr->waitInternal(); + SyclEventImplPtr->getContextImpl() != Context || NoPiEvent) { + // Call wait, because the command for the event might not have been + // enqueued when kernel fusion is happening. 
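+      // A fusion placeholder command reports producesPiEvent() == false
+      // (see KernelFusionCommand further down in this patch), so there is
+      // no native event to put on the wait list; blocking on the SYCL
+      // event is the only correct option in that case.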
+ SyclEventImplPtr->wait(SyclEventImplPtr); } else { Events.push_back(SyclEventImplPtr->getHandleRef()); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e0a296a8ede3c..df1dc48e6fdc2 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -498,6 +498,15 @@ class queue_impl { bool ext_oneapi_empty() const; + /// Check whether the queue is in fusion mode. + /// + /// \return true if the queue is in fusion mode, false otherwise. + bool is_in_fusion_mode() { + return detail::Scheduler::getInstance().isInFusionMode( + std::hash::element_type *>()( + this)); + } + protected: // template is needed for proper unit testing template diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 78707467c1833..5ef520a6d2ed7 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -168,6 +168,8 @@ static std::string commandToNodeType(Command::CommandType Type) { return "host_acc_create_buffer_lock_node"; case Command::CommandType::EMPTY_TASK: return "host_acc_destroy_buffer_release_node"; + case Command::CommandType::FUSION: + return "kernel_fusion_placeholder_node"; default: return "unknown_node"; } @@ -196,6 +198,8 @@ static std::string commandToName(Command::CommandType Type) { return "Host Accessor Creation/Buffer Lock"; case Command::CommandType::EMPTY_TASK: return "Host Accessor Destruction/Buffer Lock Release"; + case Command::CommandType::FUSION: + return "Kernel Fusion Placeholder"; default: return "Unknown Action"; } @@ -2586,6 +2590,125 @@ bool ExecCGCommand::readyForCleanup() const { return MLeafCounter == 0 && MEvent->isCompleted(); return Command::readyForCleanup(); } + +KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue) + : Command(Command::CommandType::FUSION, Queue), + MStatus(FusionStatus::ACTIVE) { + emitInstrumentationDataProxy(); +} + +std::vector &KernelFusionCommand::auxiliaryCommands() { + return MAuxiliaryCommands; +} + +void KernelFusionCommand::addToFusionList(ExecCGCommand *Kernel) { + MFusionList.push_back(Kernel); +} + +std::vector &KernelFusionCommand::getFusionList() { + return MFusionList; +} + +bool KernelFusionCommand::producesPiEvent() const { return false; } + +pi_int32 KernelFusionCommand::enqueueImp() { + waitForPreparedHostEvents(); + waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef()); + + return PI_SUCCESS; +} + +void KernelFusionCommand::setFusionStatus(FusionStatus Status) { + MStatus = Status; +} + +void KernelFusionCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) { + return; + } + // Create a payload with the command name and an event using this payload to + // emit a node_create + MCommandNodeType = commandToNodeType(MType); + MCommandName = commandToName(MType); + + static unsigned FusionNodeCount = 0; + std::stringstream PayloadStr; + PayloadStr << "Fusion command #" << FusionNodeCount++; + xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str()); + + uint64_t CommandInstanceNo = 0; + xpti_td *CmdTraceEvent = + xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event, + xpti_at::active, &CommandInstanceNo); + + MInstanceID = CommandInstanceNo; + if (CmdTraceEvent) { + MTraceEvent = static_cast(CmdTraceEvent); + // If we are seeing this event again, then the instance ID + // will be greater + // than 1; in this case, we must skip sending a + // notification to create a node as this node has 
already + // been created. We return this value so the epilog method + // can be called selectively. + // See makeTraceEventProlog. + MFirstInstance = (CommandInstanceNo == 1); + } + + // This function is called in the constructor of the command. At this point + // the kernel fusion list is still empty, so we don't have a terrible lot of + // information we could attach to this node here. + if (MFirstInstance && CmdTraceEvent) { + xpti::addMetadata(CmdTraceEvent, "sycl_device", + deviceToID(MQueue->get_device())); + xpti::addMetadata(CmdTraceEvent, "sycl_device_type", + deviceToString(MQueue->get_device())); + xpti::addMetadata(CmdTraceEvent, "sycl_device_name", + getSyclObjImpl(MQueue->get_device())->getDeviceName()); + } + + if (MFirstInstance) { + xptiNotifySubscribers(MStreamID, xpti::trace_node_create, + detail::GSYCLGraphEvent, + static_cast(MTraceEvent), MInstanceID, + static_cast(MCommandNodeType.c_str())); + } + +#endif +} + +void KernelFusionCommand::printDot(std::ostream &Stream) const { + Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; + + Stream << "ID = " << this << "\\n"; + Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n" + << "FUSION LIST: {"; + bool Initial = true; + for (auto *Cmd : MFusionList) { + if (!Initial) { + Stream << ",\\n"; + } + Initial = false; + auto *KernelCG = static_cast(&Cmd->getCG()); + if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) { + Stream << "created from source"; + } else { + Stream << demangleKernelName(KernelCG->getKernelName()); + } + } + Stream << "}\\n"; + + Stream << "\"];" << std::endl; + + for (const auto &Dep : MDeps) { + Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\"" + << " [ label = \"Access mode: " + << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n" + << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]" + << std::endl; + } +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 423208c8ba6ad..8a1bbe7370793 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -104,7 +104,8 @@ class Command { UNMAP_MEM_OBJ, UPDATE_REQUIREMENT, EMPTY_TASK, - HOST_TASK + HOST_TASK, + FUSION }; Command(CommandType Type, QueueImplPtr Queue); @@ -571,6 +572,8 @@ pi_int32 enqueueImpKernel( std::vector &RawEvents, RT::PiEvent *OutEvent, const std::function &getMemAllocationFunc); +class KernelFusionCommand; + /// The exec CG command enqueues execution of kernel or explicit memory /// operation. class ExecCGCommand : public Command { @@ -586,6 +589,17 @@ class ExecCGCommand : public Command { detail::CG &getCG() const { return *MCommandGroup; } + // MEmptyCmd is only employed if this command refers to host-task. + // The mechanism of lookup for single EmptyCommand amongst users of + // host-task-representing command is unreliable. This unreliability roots in + // the cleanup process. + EmptyCommand *MEmptyCmd = nullptr; + + // MFusionCommand is employed to mark a CG command as part of a kernel fusion + // and allows to refer back to the corresponding KernelFusionCommand if + // necessary. 
+ KernelFusionCommand *MFusionCmd = nullptr; + bool producesPiEvent() const final; bool supportsPostEnqueueCleanup() const final; @@ -619,6 +633,43 @@ class UpdateHostRequirementCommand : public Command { void **MDstPtr = nullptr; }; +/// The KernelFusionCommand is placed in the execution graph together with the +/// individual kernels of the fusion list to control kernel fusion. +class KernelFusionCommand : public Command { +public: + enum class FusionStatus { ACTIVE, CANCELLED, COMPLETE, DELETED }; + + explicit KernelFusionCommand(QueueImplPtr Queue); + + void printDot(std::ostream &Stream) const final; + void emitInstrumentationData() final; + bool producesPiEvent() const final; + + std::vector &auxiliaryCommands(); + + void addToFusionList(ExecCGCommand *Kernel); + + std::vector &getFusionList(); + + /// + /// Set the status of this fusion command to \p Status. This function should + /// only be called under the protection of the scheduler write-lock. + void setFusionStatus(FusionStatus Status); + + bool isActive() const { return MStatus == FusionStatus::ACTIVE; } + + bool readyForDeletion() const { return MStatus == FusionStatus::DELETED; } + +private: + pi_int32 enqueueImp() final; + + std::vector MFusionList; + + std::vector MAuxiliaryCommands; + + FusionStatus MStatus; +}; + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5bdcaade2f5ce..d3ee02d6e1f17 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -109,6 +109,12 @@ Scheduler::GraphBuilder::GraphBuilder() { if (GraphPrintOpts.find("after_addHostAcc") != std::string::npos || EnableAlways) MPrintOptionsArray[AfterAddHostAcc] = true; + if (GraphPrintOpts.find("after_fusionComplete") != std::string::npos || + EnableAlways) + MPrintOptionsArray[AfterFusionComplete] = true; + if (GraphPrintOpts.find("after_fusionCancel") != std::string::npos || + EnableAlways) + MPrintOptionsArray[AfterFusionCancel] = true; } } @@ -129,6 +135,14 @@ static void unmarkVisitedNodes(std::vector &Visited) { static void handleVisitedNodes(std::vector &Visited) { for (Command *Cmd : Visited) { if (Cmd->MMarks.MToBeDeleted) { + if (Cmd->getType() == Command::FUSION && + !static_cast(Cmd)->readyForDeletion()) { + // Fusion commands might still be needed because fusion might be + // aborted, but a later call to complete_fusion still needs to be able + // to return a valid event. Clean-up of fusion commands is therefore + // explicitly handled by start fusion. + return; + } Cmd->getEvent()->setCommand(nullptr); delete Cmd; } else @@ -884,7 +898,7 @@ Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector &Reqs, return EmptyCmd; } -static bool isInteropHostTask(const std::unique_ptr &Cmd) { +static bool isInteropHostTask(ExecCGCommand *Cmd) { if (Cmd->getCG().getType() != CG::CGTYPE::CodeplayHostTask) return false; @@ -914,7 +928,7 @@ static void combineAccessModesOfReqs(std::vector &Reqs) { } } -Command * +Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, const QueueImplPtr &Queue, std::vector &ToEnqueue) { @@ -925,6 +939,74 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, if (!NewCmd) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); + // Host tasks cannot participate in fusion. They take the regular route. 
If + // they create any requirement or event dependency on any of the kernels in + // the fusion list, this will lead to cancellation of the fusion in the + // GraphProcessor. + auto QUniqueID = std::hash()(Queue); + if (isInFusionMode(QUniqueID) && !NewCmd->isHostTask()) { + auto *FusionCmd = findFusionList(QUniqueID)->second.get(); + FusionCmd->addToFusionList(NewCmd.get()); + // Add the kernel to the graph, but delay the enqueue of any auxiliary + // commands (e.g., allocations) resulting from that process by adding them + // to the list of auxiliary commands of the fusion command. + createGraphForCommand(NewCmd.get(), NewCmd->getCG(), + isInteropHostTask(NewCmd.get()), Reqs, Events, Queue, + FusionCmd->auxiliaryCommands()); + // We need to check the commands that this kernel depends on for any other + // commands that have been submitted to another queue which is also in + // fusion mode. If we detect such another command, we cancel fusion for that + // other queue to avoid circular dependencies. + // Handle requirements on any commands part of another active fusion. + for (auto &Dep : NewCmd->MDeps) { + auto *DepCmd = Dep.MDepCommand; + if (!DepCmd) { + continue; + } + if (DepCmd->getQueue() != Queue && isPartOfActiveFusion(DepCmd)) { + printFusionWarning("Aborting fusion because of requirement from a " + "different fusion process"); + cancelFusion(DepCmd->getQueue(), ToEnqueue); + } + } + // Handle event dependencies on any commands part of another active fusion. + for (auto &Ev : Events) { + auto *EvDepCmd = static_cast(Ev->getCommand()); + if (!EvDepCmd) { + continue; + } + if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) { + printFusionWarning("Aborting fusion because of event dependency from a " + "different fusion"); + cancelFusion(EvDepCmd->getQueue(), ToEnqueue); + } + } + + // Set the fusion command, so we recognize when another command depends on a + // kernel in the fusion list. + NewCmd->MFusionCmd = FusionCmd; + std::vector ToCleanUp; + // Add an event dependency from the fusion placeholder command to the new + // kernel. + auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp); + if (ConnectionCmd) { + FusionCmd->auxiliaryCommands().push_back(ConnectionCmd); + } + return {NewCmd.release(), FusionCmd->getEvent(), false}; + } + createGraphForCommand(NewCmd.get(), NewCmd->getCG(), + isInteropHostTask(NewCmd.get()), Reqs, Events, Queue, + ToEnqueue); + auto Event = NewCmd->getEvent(); + return {NewCmd.release(), Event, true}; +} + +void Scheduler::GraphBuilder::createGraphForCommand( + Command *NewCmd, CG &CG, bool isInteropTask, + std::vector &Reqs, + const std::vector &Events, QueueImplPtr Queue, + std::vector &ToEnqueue) { + if (MPrintOptionsArray[BeforeAddCG]) printGraphAsDot("before_addCG"); @@ -941,9 +1023,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, { const QueueImplPtr &QueueForAlloca = - isInteropHostTask(NewCmd) - ? static_cast(NewCmd->getCG()).MQueue - : Queue; + isInteropTask ? 
static_cast(CG).MQueue : Queue; Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue); markModifiedIfWrite(Record, Req); @@ -969,9 +1049,8 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (isInteropHostTask(NewCmd)) { - const detail::CGHostTask &HT = - static_cast(NewCmd->getCG()); + if (isInteropTask) { + const detail::CGHostTask &HT = static_cast(CG); if (HT.MQueue->getContextImplPtr() != Record->MCurContext) { NeedMemMoveToHost = true; @@ -990,10 +1069,12 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, findDepsForReq(Record, Req, Queue->getContextImplPtr()); for (Command *Dep : Deps) { - Command *ConnCmd = - NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp); - if (ConnCmd) - ToEnqueue.push_back(ConnCmd); + if (Dep != NewCmd) { + Command *ConnCmd = + NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp); + if (ConnCmd) + ToEnqueue.push_back(ConnCmd); + } } } @@ -1006,11 +1087,14 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, const Requirement *Req = Dep.MDepRequirement; MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp); - addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue); + addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue); } // Register all the events as dependencies for (detail::EventImplPtr e : Events) { + if (e->getCommand() && e->getCommand() == NewCmd) { + continue; + } if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp)) ToEnqueue.push_back(ConnCmd); } @@ -1018,9 +1102,9 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, if (MPrintOptionsArray[AfterAddCG]) printGraphAsDot("after_addCG"); - for (Command *Cmd : ToCleanUp) + for (Command *Cmd : ToCleanUp) { cleanupCommand(Cmd); - return NewCmd.release(); + } } void Scheduler::GraphBuilder::decrementLeafCountersForRecord( @@ -1126,10 +1210,12 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) { handleVisitedNodes(MVisitedCmds); } -void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { +void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd, + bool AllowUnsubmitted) { if (SYCLConfig::get()) return; - assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()); + assert(Cmd->MLeafCounter == 0 && + (Cmd->isSuccessfullyEnqueued() || AllowUnsubmitted)); Command::CommandType CmdT = Cmd->getType(); assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF); @@ -1156,6 +1242,14 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { DepCmd->MUsers.erase(Cmd); } + if (Cmd->getType() == Command::FUSION && + !static_cast(Cmd)->readyForDeletion()) { + // Fusion commands might still be needed because fusion might be aborted, + // but a later call to complete_fusion still needs to be able to return a + // valid event. Clean-up of fusion commands is therefore explicitly handled + // by start fusion. 
+ return; + } Cmd->getEvent()->setCommand(nullptr); delete Cmd; } @@ -1236,6 +1330,209 @@ Command *Scheduler::GraphBuilder::connectDepEvent( return ConnectCmd; } +void Scheduler::GraphBuilder::startFusion(QueueImplPtr Queue) { + auto QUniqueID = std::hash()(Queue); + if (isInFusionMode(QUniqueID)) { + throw sycl::exception{sycl::make_error_code(sycl::errc::invalid), + "Queue already in fusion mode"}; + } + auto OldFusionCmd = findFusionList(QUniqueID); + if (OldFusionCmd != MFusionMap.end()) { + // If fusion was used on this queue previously, the old fusion command might + // still be around to make sure that even after + // cancellation of the fusion due to synchronization, complete_fusion is + // still able to return a valid event. + OldFusionCmd->second->setFusionStatus( + KernelFusionCommand::FusionStatus::DELETED); + cleanupCommand(OldFusionCmd->second.release()); + MFusionMap.erase(OldFusionCmd); + } + MFusionMap.emplace(QUniqueID, std::make_unique(Queue)); +} + +void Scheduler::GraphBuilder::removeNodeFromGraph( + Command *Node, std::vector &ToEnqueue) { + // Remove the placeholder command as leaf of all its requirements and from the + // user list of all its dependencies. + for (auto &Dep : Node->MDeps) { + auto AccessMode = Dep.MDepRequirement->MAccessMode; + auto *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj); + + Node->MLeafCounter -= Record->MReadLeaves.remove(Node); + Node->MLeafCounter -= Record->MWriteLeaves.remove(Node); + // If the placeholder had a write-requirement on this record, we need to + // restore the previous leaves. + if (AccessMode != access::mode::read) { + for (auto PrevDep : Dep.MDepCommand->MDeps) { + auto *DepReq = PrevDep.MDepRequirement; + auto *DepRecord = getMemObjRecord(DepReq->MSYCLMemObj); + if (DepRecord == Record) { + // Need to restore this as a leaf, because we pushed it from the + // leaves when adding the placeholder command. + assert(Dep.MDepCommand); + addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode, + ToEnqueue); + } + } + } + Dep.MDepCommand->MUsers.erase(Node); + } + + Node->MDeps.clear(); +} + +void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue, + std::vector &ToEnqueue) { + auto QUniqueID = std::hash()(Queue); + if (!isInFusionMode(QUniqueID)) { + return; + } + auto FusionList = findFusionList(QUniqueID); + + auto *PlaceholderCmd = (*FusionList).second.get(); + + // Enqueue all the kernels/commands from the fusion list + auto FusedCmdList = PlaceholderCmd->getFusionList(); + ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end()); + + // The commands establishing an event dependency between the fusion + // placeholder command and the individual kernels need to be enqueued. 
+ ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(), + PlaceholderCmd->auxiliaryCommands().end()); + + ToEnqueue.push_back(PlaceholderCmd); + + if (MPrintOptionsArray[AfterFusionCancel]) { + printGraphAsDot("after_fusionCancel"); + } + + // Set the status for the fusion command + PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::CANCELLED); +} + +EventImplPtr +Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue, + std::vector &ToEnqueue, + const property_list &PropList) { + auto QUniqueID = std::hash()(Queue); +#if SYCL_EXT_CODEPLAY_KERNEL_FUSION + if (!isInFusionMode(QUniqueID)) { + auto InactiveFusionList = findFusionList(QUniqueID); + if (InactiveFusionList == MFusionMap.end()) { + throw sycl::exception{ + sycl::make_error_code(sycl::errc::invalid), + "Calling complete_fusion on a queue not in fusion mode"}; + } + return InactiveFusionList->second->getEvent(); + } + + auto FusionList = findFusionList(QUniqueID); + auto *PlaceholderCmd = FusionList->second.get(); + auto &CmdList = PlaceholderCmd->getFusionList(); + + // TODO: The logic to invoke the JIT compiler to create a fused kernel from + // the list will be added in a later PR. + auto FusedCG = nullptr; + + if (!FusedCG) { + // If the JIT compiler returns a nullptr, JIT compilation of the fused + // kernel failed. In that case, simply cancel the fusion and run each kernel + // on its own. + auto LastEvent = PlaceholderCmd->getEvent(); + this->cancelFusion(Queue, ToEnqueue); + return LastEvent; + } + + // Inherit all event dependencies from the input commands in the fusion list. + std::vector FusedEventDeps; + for (auto *Cmd : CmdList) { + FusedEventDeps.insert(FusedEventDeps.end(), + Cmd->getPreparedDepsEvents().begin(), + Cmd->getPreparedDepsEvents().end()); + FusedEventDeps.insert(FusedEventDeps.end(), + Cmd->getPreparedHostDepsEvents().begin(), + Cmd->getPreparedHostDepsEvents().end()); + } + + // Remove internal explicit dependencies, i.e., explicit dependencies from one + // kernel in the fusion list to another kernel also in the fusion list. + FusedEventDeps.erase( + std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(), + [&](EventImplPtr &E) { + if (E->getCommand() == PlaceholderCmd) { + return true; + } + if (E->getCommand() && + static_cast(E->getCommand())->getType() == + Command::RUN_CG) { + auto *RunCGCmd = + static_cast(E->getCommand()); + if (RunCGCmd->MFusionCmd == PlaceholderCmd) { + return true; + } + } + return false; + }), + FusedEventDeps.end()); + + auto FusedKernelCmd = + std::make_unique(std::move(FusedCG), Queue); + + assert(PlaceholderCmd->MDeps.empty()); + // Next, backwards iterate over all the commands in the fusion list and remove + // them from the graph to restore the state before starting fusion, so we can + // add the fused kernel to the graph in the next step. + // Clean up the old commands after successfully fusing them. 
+ for (auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) { + removeNodeFromGraph(*OldCmd, ToEnqueue); + cleanupCommand(*OldCmd, /* AllowUnsubmitted */ true); + } + + createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(), false, + FusedKernelCmd->getCG().MRequirements, FusedEventDeps, + Queue, ToEnqueue); + + ToEnqueue.push_back(FusedKernelCmd.get()); + + std::vector ToCleanUp; + // Make the placeholder command depend on the execution of the fused kernel + auto *ConnectToPlaceholder = + PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp); + if (ConnectToPlaceholder) { + ToEnqueue.push_back(ConnectToPlaceholder); + } + for (Command *Cmd : ToCleanUp) { + cleanupCommand(Cmd); + } + ToEnqueue.push_back(PlaceholderCmd); + + if (MPrintOptionsArray[AfterFusionComplete]) { + printGraphAsDot("after_fusionComplete"); + } + + // Set the status for the fusion command. + PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::COMPLETE); + + return FusedKernelCmd.release()->getEvent(); +#else // SYCL_EXT_CODEPLAY_KERNEL_FUSION + printFusionWarning("Kernel fusion not supported by this build"); + (void)PropList; + auto FusionList = findFusionList(QUniqueID); + auto *PlaceholderCmd = FusionList->second.get(); + auto LastEvent = PlaceholderCmd->getEvent(); + this->cancelFusion(Queue, ToEnqueue); + return LastEvent; +#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION +} + +bool Scheduler::GraphBuilder::isInFusionMode(QueueIdT Id) { + auto FusionList = findFusionList(Id); + if (FusionList == MFusionMap.end()) { + return false; + } + return FusionList->second->isActive(); +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 8849ad8d5f3a4..cecdf9fb1063a 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -32,7 +32,8 @@ void Scheduler::GraphProcessor::waitForEvent(const EventImplPtr &Event, return; EnqueueResultT Res; - bool Enqueued = enqueueCommand(Cmd, Res, ToCleanUp, Cmd, BLOCKING); + bool Enqueued = + enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd, BLOCKING); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) // TODO: Reschedule commands. throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -65,7 +66,7 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd, } bool Scheduler::GraphProcessor::enqueueCommand( - Command *Cmd, EnqueueResultT &EnqueueResult, + Command *Cmd, ReadLockT &GraphReadLock, EnqueueResultT &EnqueueResult, std::vector &ToCleanUp, Command *RootCommand, BlockingT Blocking) { if (!Cmd) @@ -73,6 +74,28 @@ bool Scheduler::GraphProcessor::enqueueCommand( if (Cmd->isSuccessfullyEnqueued()) return handleBlockingCmd(Cmd, EnqueueResult, RootCommand, Blocking); + if (KernelFusionCommand *FusionCmd = isPartOfActiveFusion(Cmd)) { + // The fusion is still in-flight, but some other event/command depending + // on one of the kernels in the fusion list has triggered it to be + // enqueued. To avoid circular dependencies and deadlocks, we will need to + // cancel fusion here and enqueue the kernels in the fusion list right + // away. + printFusionWarning("Aborting fusion because synchronization with one of " + "the kernels in the fusion list was requested"); + // We need to unlock the read lock, as cancelFusion in the scheduler will + // acquire a write lock to alter the graph. 
+ GraphReadLock.unlock(); + // Cancel fusion will take care of enqueueing all the kernels. + Scheduler::getInstance().cancelFusion(FusionCmd->getQueue()); + // Lock the read lock again. + GraphReadLock.lock(); + // The fusion (placeholder) command should have been enqueued by + // cancelFusion. + if (FusionCmd->isSuccessfullyEnqueued()) { + return true; + } + } + // Exit early if the command is blocked and the enqueue type is non-blocking if (Cmd->isEnqueueBlocked() && !Blocking) { EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, Cmd); @@ -83,8 +106,8 @@ bool Scheduler::GraphProcessor::enqueueCommand( // first and exit immediately if any of the commands cannot be enqueued. for (const EventImplPtr &Event : Cmd->getPreparedDepsEvents()) { if (Command *DepCmd = static_cast(Event->getCommand())) - if (!enqueueCommand(DepCmd, EnqueueResult, ToCleanUp, RootCommand, - Blocking)) + if (!enqueueCommand(DepCmd, GraphReadLock, EnqueueResult, ToCleanUp, + RootCommand, Blocking)) return false; } @@ -96,8 +119,8 @@ bool Scheduler::GraphProcessor::enqueueCommand( // completion stage and eliminate this event waiting in enqueue. for (const EventImplPtr &Event : Cmd->getPreparedHostDepsEvents()) { if (Command *DepCmd = static_cast(Event->getCommand())) - if (!enqueueCommand(DepCmd, EnqueueResult, ToCleanUp, RootCommand, - Blocking)) + if (!enqueueCommand(DepCmd, GraphReadLock, EnqueueResult, ToCleanUp, + RootCommand, Blocking)) return false; } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 171cd0aae5eee..e2e7f5df48cee 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -46,7 +46,8 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, std::vector ToCleanUp; for (Command *Cmd : Record->MReadLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp, Cmd); + bool Enqueued = + GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -58,7 +59,8 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, } for (Command *Cmd : Record->MWriteLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp, Cmd); + bool Enqueued = + GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -70,8 +72,8 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) { Command *ReleaseCmd = AllocaCmd->getReleaseCmd(); EnqueueResultT Res; - bool Enqueued = - GraphProcessor::enqueueCommand(ReleaseCmd, Res, ToCleanUp, ReleaseCmd); + bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, GraphReadLock, + Res, ToCleanUp, ReleaseCmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -109,6 +111,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, } } + bool ShouldEnqueue = true; { WriteLockT Lock = acquireWriteLock(); @@ -117,36 +120,62 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, case CG::UpdateHost: NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), DefaultHostQueue, AuxiliaryCmds); + 
NewEvent = NewCmd->getEvent(); break; - case CG::CodeplayHostTask: - NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), DefaultHostQueue, - AuxiliaryCmds); + case CG::CodeplayHostTask: { + auto Result = MGraphBuilder.addCG(std::move(CommandGroup), + DefaultHostQueue, AuxiliaryCmds); + NewCmd = Result.NewCmd; + NewEvent = Result.NewEvent; + ShouldEnqueue = Result.ShouldEnqueue; break; + } default: - NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue), - AuxiliaryCmds); + auto Result = MGraphBuilder.addCG(std::move(CommandGroup), + std::move(Queue), AuxiliaryCmds); + NewCmd = Result.NewCmd; + NewEvent = Result.NewEvent; + ShouldEnqueue = Result.ShouldEnqueue; } - NewEvent = NewCmd->getEvent(); } + if (ShouldEnqueue) { + enqueueCommandForCG(NewEvent, AuxiliaryCmds); + + for (auto StreamImplPtr : Streams) { + StreamImplPtr->flush(NewEvent); + } + + if (!AuxiliaryResources.empty()) + registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); + } + + return NewEvent; +} + +void Scheduler::enqueueCommandForCG(EventImplPtr NewEvent, + std::vector &AuxiliaryCmds) { std::vector ToCleanUp; { ReadLockT Lock = acquireReadLock(); - Command *NewCmd = static_cast(NewEvent->getCommand()); + Command *NewCmd = + (NewEvent) ? static_cast(NewEvent->getCommand()) : nullptr; EnqueueResultT Res; bool Enqueued; auto CleanUp = [&]() { if (NewCmd && (NewCmd->MDeps.size() == 0 && NewCmd->MUsers.size() == 0)) { - NewEvent->setCommand(nullptr); + if (NewEvent) { + NewEvent->setCommand(nullptr); + } delete NewCmd; } }; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp, Cmd); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd); try { if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Auxiliary enqueue process failed.", @@ -163,8 +192,8 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, // TODO: Check if lazy mode. 
EnqueueResultT Res; try { - bool Enqueued = - GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp, NewCmd); + bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Lock, Res, + ToCleanUp, NewCmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -177,14 +206,6 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, } } cleanupCommands(ToCleanUp); - - for (auto StreamImplPtr : Streams) { - StreamImplPtr->flush(NewEvent); - } - if (!AuxiliaryResources.empty()) - registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); - - return NewEvent; } EventImplPtr Scheduler::addCopyBack(Requirement *Req) { @@ -206,13 +227,14 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp, Cmd); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); } - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp, NewCmd); + Enqueued = + GraphProcessor::enqueueCommand(NewCmd, Lock, Res, ToCleanUp, NewCmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -286,14 +308,15 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp, Cmd); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); } if (Command *NewCmd = static_cast(NewCmdEvent->getCommand())) { - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp, NewCmd); + Enqueued = + GraphProcessor::enqueueCommand(NewCmd, Lock, Res, ToCleanUp, NewCmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -315,18 +338,20 @@ void Scheduler::releaseHostAccessor(Requirement *Req) { BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; - enqueueLeavesOfReqUnlocked(Req, ToCleanUp); + enqueueLeavesOfReqUnlocked(Req, Lock, ToCleanUp); } cleanupCommands(ToCleanUp); } void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req, + ReadLockT &GraphReadLock, std::vector &ToCleanUp) { MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get(); - auto EnqueueLeaves = [&ToCleanUp](LeavesCollection &Leaves) { + auto EnqueueLeaves = [&ToCleanUp, &GraphReadLock](LeavesCollection &Leaves) { for (Command *Cmd : Leaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp, Cmd); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, + ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -338,14 +363,15 @@ void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req, } void Scheduler::enqueueUnblockedCommands( - const std::vector &ToEnqueue, + const std::vector &ToEnqueue, ReadLockT &GraphReadLock, std::vector &ToCleanUp) { for (auto &Event : ToEnqueue) { Command *Cmd = static_cast(Event->getCommand()); if (!Cmd) 
continue; EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp, Cmd); + bool Enqueued = + GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); @@ -450,7 +476,7 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) { // update self-event status Cmd->getEvent()->setComplete(); } - Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, ToCleanUp); + Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, Lock, ToCleanUp); } cleanupCommands(ToCleanUp); } @@ -540,6 +566,62 @@ void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) { thread_local bool Scheduler::ForceDeferredMemObjRelease = false; +void Scheduler::startFusion(QueueImplPtr Queue) { + WriteLockT Lock = acquireWriteLock(); + MGraphBuilder.startFusion(Queue); +} + +void Scheduler::cancelFusion(QueueImplPtr Queue) { + std::vector ToEnqueue; + { + WriteLockT Lock = acquireWriteLock(); + MGraphBuilder.cancelFusion(Queue, ToEnqueue); + } + enqueueCommandForCG(nullptr, ToEnqueue); +} + +EventImplPtr Scheduler::completeFusion(QueueImplPtr Queue, + const property_list &PropList) { + std::vector ToEnqueue; + EventImplPtr FusedEvent; + { + WriteLockT Lock = acquireWriteLock(); + FusedEvent = MGraphBuilder.completeFusion(Queue, ToEnqueue, PropList); + } + enqueueCommandForCG(nullptr, ToEnqueue); + + return FusedEvent; +} + +bool Scheduler::isInFusionMode(QueueIdT queue) { + ReadLockT Lock = acquireReadLock(); + return MGraphBuilder.isInFusionMode(queue); +} + +void Scheduler::printFusionWarning(const std::string &Message) { + if (detail::SYCLConfig::get() > 0) { + std::cerr << "WARNING: " << Message << "\n"; + } +} + +KernelFusionCommand *Scheduler::isPartOfActiveFusion(Command *Cmd) { + auto CmdType = Cmd->getType(); + switch (CmdType) { + case Command::FUSION: { + auto *FusionCmd = static_cast(Cmd); + return (FusionCmd->isActive()) ? FusionCmd : nullptr; + } + case Command::RUN_CG: { + auto *CGCmd = static_cast(Cmd); + return (CGCmd->MFusionCmd && CGCmd->MFusionCmd->isActive()) + ? CGCmd->MFusionCmd + : nullptr; + } + default: + return nullptr; + } +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 1829199eb8fd2..5026babe712f8 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -184,6 +184,11 @@ using EventImplPtr = std::shared_ptr; using QueueImplPtr = std::shared_ptr; using StreamImplPtr = std::shared_ptr; +using QueueIdT = std::hash>::result_type; +using CommandPtr = std::unique_ptr; +using FusionList = std::unique_ptr; +using FusionMap = std::unordered_map; + /// Memory Object Record /// /// The MemObjRecord is used in command groups (todo better desc). 
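The QueueIdT/FusionList/FusionMap aliases introduced above key all fusion
bookkeeping by the hash of the queue's shared_ptr<queue_impl>. A minimal
self-contained sketch of that mapping, using simplified stand-in types
rather than the runtime's actual classes:

#include <memory>
#include <unordered_map>

// Illustrative stand-ins for detail::queue_impl and KernelFusionCommand.
struct queue_impl {};
struct KernelFusionCommand {
  bool Active = true;
  bool isActive() const { return Active; }
};

using QueueIdT = std::hash<std::shared_ptr<queue_impl>>::result_type;
using FusionList = std::unique_ptr<KernelFusionCommand>;
using FusionMap = std::unordered_map<QueueIdT, FusionList>;

// Mirrors GraphBuilder::isInFusionMode: a queue is in fusion mode if a
// fusion command is registered under its id and that command is active.
bool isInFusionMode(const FusionMap &Map,
                    const std::shared_ptr<queue_impl> &Queue) {
  QueueIdT Id = std::hash<std::shared_ptr<queue_impl>>{}(Queue);
  auto It = Map.find(Id);
  return It != Map.end() && It->second->isActive();
}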
@@ -435,6 +440,14 @@ class Scheduler {
 
   void deferMemObjRelease(const std::shared_ptr<detail::SYCLMemObjI> &MemObj);
 
+  void startFusion(QueueImplPtr Queue);
+
+  void cancelFusion(QueueImplPtr Queue);
+
+  EventImplPtr completeFusion(QueueImplPtr Queue, const property_list &);
+
+  bool isInFusionMode(QueueIdT Queue);
+
   Scheduler();
   ~Scheduler();
   void releaseResources();
@@ -472,16 +485,29 @@ class Scheduler {
 
   void NotifyHostTaskCompletion(Command *Cmd);
 
+  void enqueueCommandForCG(EventImplPtr NewEvent,
+                           std::vector<Command *> &AuxiliaryCmds);
+
   static void enqueueLeavesOfReqUnlocked(const Requirement *const Req,
+                                         ReadLockT &GraphReadLock,
                                          std::vector<Command *> &ToCleanUp);
 
   static void
   enqueueUnblockedCommands(const std::vector<EventImplPtr> &CmdsToEnqueue,
+                           ReadLockT &GraphReadLock,
                            std::vector<Command *> &ToCleanUp);
 
   // May lock graph with read and write modes during execution.
   void cleanupDeferredMemObjects(BlockingT Blocking);
 
+  // POD struct to convey some additional information from GraphBuilder::addCG
+  // to the Scheduler to support kernel fusion.
+  struct GraphBuildResult {
+    Command *NewCmd;
+    EventImplPtr NewEvent;
+    bool ShouldEnqueue;
+  };
+
   void registerAuxiliaryResources(
       EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
 
   void cleanupAuxiliaryResources(BlockingT Blocking);
@@ -500,10 +526,12 @@ class Scheduler {
     ///
     /// \sa queue::submit, Scheduler::addCG
     ///
-    /// \return a command that represents command group execution.
-    Command *addCG(std::unique_ptr<detail::CG> CommandGroup,
-                   const QueueImplPtr &Queue,
-                   std::vector<Command *> &ToEnqueue);
+    /// \return a command that represents command group execution and a bool
+    /// indicating whether this command should be enqueued to the graph
+    /// processor right away or not.
+    GraphBuildResult addCG(std::unique_ptr<detail::CG> CommandGroup,
+                           const QueueImplPtr &Queue,
+                           std::vector<Command *> &ToEnqueue);
 
     /// Registers a \ref CG "command group" that updates host memory to the
     /// latest state.
@@ -531,7 +559,7 @@ class Scheduler {
     /// with Event passed and its dependencies.
     void optimize(const EventImplPtr &Event);
 
-    void cleanupCommand(Command *Cmd);
+    void cleanupCommand(Command *Cmd, bool AllowUnsubmitted = false);
 
     /// Reschedules the command passed using Queue provided.
     ///
@@ -583,6 +611,16 @@ class Scheduler {
                            const DepDesc &Dep,
                            std::vector<Command *> &ToCleanUp);
 
+    void startFusion(QueueImplPtr Queue);
+
+    void cancelFusion(QueueImplPtr Queue, std::vector<Command *> &ToEnqueue);
+
+    EventImplPtr completeFusion(QueueImplPtr Queue,
+                                std::vector<Command *> &ToEnqueue,
+                                const property_list &);
+
+    bool isInFusionMode(QueueIdT queue);
+
     std::vector<SYCLMemObjI *> MMemObjs;
 
   private:
@@ -624,6 +662,12 @@ class Scheduler {
                        std::vector<Command *> &ToEnqueue,
                        const bool AddDepsToLeaves = true);
 
+    void createGraphForCommand(Command *NewCmd, CG &CG, bool isInteropTask,
+                               std::vector<Requirement *> &Reqs,
+                               const std::vector<EventImplPtr> &Events,
+                               QueueImplPtr Queue,
+                               std::vector<Command *> &ToEnqueue);
+
   protected:
     /// Finds a command dependency corresponding to the record.
     DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record);
@@ -649,10 +693,22 @@ class Scheduler {
 
     void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req);
 
-    /// Used to track commands that need to be visited during graph traversal.
+    FusionMap::iterator findFusionList(QueueIdT Id) {
+      return MFusionMap.find(Id);
+    }
+
+    void removeNodeFromGraph(Command *Node, std::vector<Command *> &ToEnqueue);
+
+    /// Used to track commands that need to be visited during graph
+    /// traversal.
     std::queue<Command *> MCmdsToVisit;
     /// Used to track commands that have been visited during graph traversal.
     std::vector<Command *> MVisitedCmds;
+
+    /// Used to track queues that are in fusion mode and the
+    /// command-groups/kernels submitted for fusion.
+    FusionMap MFusionMap;
+
     /// Prints contents of graph to text file in DOT format
     ///
     /// \param ModeName is a stringified printing mode name to be used
@@ -665,6 +721,8 @@ class Scheduler {
       AfterAddCopyBack,
       BeforeAddHostAcc,
       AfterAddHostAcc,
+      AfterFusionComplete,
+      AfterFusionCancel,
       Size
     };
     std::array<bool, Size> MPrintOptionsArray{false};
@@ -765,7 +823,8 @@ class Scheduler {
     ///
     /// The function may unlock and lock GraphReadLock as needed. Upon return
     /// the lock is left in locked state.
-    static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
+    static bool enqueueCommand(Command *Cmd, ReadLockT &GraphReadLock,
+                               EnqueueResultT &EnqueueResult,
                                std::vector<Command *> &ToCleanUp,
                                Command *RootCommand,
                                BlockingT Blocking = NON_BLOCKING);
@@ -832,6 +891,11 @@ class Scheduler {
   friend class queue_impl;
   friend class event_impl;
   friend class ::MockScheduler;
+
+private:
+  static void printFusionWarning(const std::string &Message);
+
+  static KernelFusionCommand *isPartOfActiveFusion(Command *Cmd);
 };
 
 } // namespace detail
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index e0c80bca1ef23..c0b5e3881818c 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -147,11 +147,12 @@ event handler::finalize() {
     }
   }
 
-  if (MRequirements.size() + MEvents.size() + MStreamStorage.size() == 0) {
+  if (!MQueue->is_in_fusion_mode() &&
+      MRequirements.size() + MEvents.size() + MStreamStorage.size() == 0) {
     // if user does not add a new dependency to the dependency graph, i.e.
-    // the graph is not changed, then this faster path is used to submit
-    // kernel bypassing scheduler and avoiding CommandGroup, Command objects
-    // creation.
+    // the graph is not changed, and the queue is not in fusion mode, then
+    // this faster path is used to submit kernel bypassing scheduler and
+    // avoiding CommandGroup, Command objects creation.
 
     std::vector<RT::PiEvent> RawEvents;
     detail::EventImplPtr NewEvent;
diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt
index 64bea3540a92e..ea6c55940e806 100644
--- a/sycl/unittests/scheduler/CMakeLists.txt
+++ b/sycl/unittests/scheduler/CMakeLists.txt
@@ -22,4 +22,5 @@ add_sycl_unittest(SchedulerTests OBJECT
   RunOnHostIntelCG.cpp
   EnqueueWithDependsOnDeps.cpp
   AccessorDefaultCtor.cpp
+  KernelFusion.cpp
 )
diff --git a/sycl/unittests/scheduler/KernelFusion.cpp b/sycl/unittests/scheduler/KernelFusion.cpp
new file mode 100644
index 0000000000000..8b45c03e37f1f
--- /dev/null
+++ b/sycl/unittests/scheduler/KernelFusion.cpp
@@ -0,0 +1,150 @@
+//==----------- KernelFusion.cpp - Kernel Fusion scheduler unit tests ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SchedulerTest.hpp"
+#include "SchedulerTestUtils.hpp"
+
+#include <helpers/PiMock.hpp>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+using namespace sycl;
+using EventImplPtr = std::shared_ptr<detail::event_impl>;
+
+template <typename KernelName>
+detail::Command *CreateTaskCommand(MockScheduler &MS,
+                                   detail::QueueImplPtr DevQueue,
+                                   buffer<int> &buf) {
+  MockHandlerCustomFinalize MockCGH(DevQueue, false);
+
+  auto acc = buf.get_access<access::mode::read_write>(
+      static_cast<handler &>(MockCGH));
+
+  kernel_bundle<bundle_state::input> KernelBundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::input>(
+          DevQueue->get_context());
+  auto ExecBundle = sycl::build(KernelBundle);
+  MockCGH.use_kernel_bundle(ExecBundle);
+  MockCGH.single_task<KernelName>([] {});
+
+  auto CmdGrp = MockCGH.finalize();
+
+  std::vector<detail::Command *> ToEnqueue;
+  detail::Command *NewCmd = MS.addCG(std::move(CmdGrp), DevQueue, ToEnqueue);
+  EXPECT_EQ(ToEnqueue.size(), 0u);
+  return NewCmd;
+}
+
+bool CheckTestExecRequirements(const platform &plt) {
+  if (plt.is_host()) {
+    std::cout << "Not run due to host-only environment\n";
+    return false;
+  }
+  // This test only contains device image for SPIR-V capable devices.
+  if (plt.get_backend() != sycl::backend::opencl &&
+      plt.get_backend() != sycl::backend::ext_oneapi_level_zero) {
+    std::cout << "Only OpenCL and Level Zero are supported for this test\n";
+    return false;
+  }
+  return true;
+}
+
+bool containsCommand(detail::Command *Cmd,
+                     std::vector<detail::Command *> &List) {
+  return std::find(List.begin(), List.end(), Cmd) != List.end();
+}
+
+bool dependsOnViaDep(detail::Command *Dependent, detail::Command *Dependee) {
+  return std::find_if(Dependent->MDeps.begin(), Dependent->MDeps.end(),
+                      [=](detail::DepDesc &Desc) {
+                        return Desc.MDepCommand == Dependee;
+                      }) != Dependent->MDeps.end();
+}
+
+bool dependsOnViaEvent(detail::Command *Dependent, detail::Command *Dependee) {
+  auto &DepEvents = Dependent->getPreparedDepsEvents();
+  return std::find_if(DepEvents.begin(), DepEvents.end(),
+                      [=](const EventImplPtr &Ev) {
+                        return Ev->getCommand() && Ev->getCommand() == Dependee;
+                      }) != DepEvents.end();
+}
+
+TEST_F(SchedulerTest, CancelKernelFusion) {
+  unittest::PiMock Mock;
+  platform Plt = Mock.getPlatform();
+  if (!CheckTestExecRequirements(Plt))
+    return;
+
+  queue QueueDev(context(Plt), default_selector_v);
+  MockScheduler MS;
+
+  detail::QueueImplPtr QueueDevImpl = detail::getSyclObjImpl(QueueDev);
+
+  // Test scenario: Create four memory objects (buffers) and one command for
+  // each memory object before starting fusion. Then start fusion, again adding
+  // one command with a requirement for each of the memory objects. Then cancel
+  // fusion and check for correct dependencies.
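+  //
+  // A sketch of the dependency structure the checks below expect after
+  // cancelFusion (illustration only):
+  //
+  //   nonFusionCmdN <--- data dep (MDeps) --- fusionCmdN        (N = 1..4)
+  //   fusionCmd1..4 <--- event deps --- placeholder FUSION command
+  //
+  // Each command submitted during fusion keeps exactly one data dependency
+  // on the earlier command for the same buffer, and the placeholder command
+  // depends on all four fused commands via their events.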
+
+  buffer<int> b1{range<1>{4}};
+  buffer<int> b2{range<1>{4}};
+  buffer<int> b3{range<1>{4}};
+  buffer<int> b4{range<1>{4}};
+
+  auto *nonFusionCmd1 = CreateTaskCommand<class Kernel1>(MS, QueueDevImpl, b1);
+  auto *nonFusionCmd2 = CreateTaskCommand<class Kernel2>(MS, QueueDevImpl, b2);
+  auto *nonFusionCmd3 = CreateTaskCommand<class Kernel3>(MS, QueueDevImpl, b3);
+  auto *nonFusionCmd4 = CreateTaskCommand<class Kernel4>(MS, QueueDevImpl, b4);
+
+  MS.startFusion(QueueDevImpl);
+
+  auto *fusionCmd1 = CreateTaskCommand<class Kernel5>(MS, QueueDevImpl, b1);
+  auto *fusionCmd2 = CreateTaskCommand<class Kernel6>(MS, QueueDevImpl, b2);
+  auto *fusionCmd3 = CreateTaskCommand<class Kernel7>(MS, QueueDevImpl, b3);
+  auto *fusionCmd4 = CreateTaskCommand<class Kernel8>(MS, QueueDevImpl, b4);
+
+  std::vector<detail::Command *> ToEnqueue;
+  MS.cancelFusion(QueueDevImpl, ToEnqueue);
+
+  // The list of commands filled by cancelFusion should contain the four
+  // commands submitted while in fusion mode, plus the placeholder command.
+  EXPECT_EQ(ToEnqueue.size(), 5u);
+  EXPECT_TRUE(containsCommand(fusionCmd1, ToEnqueue));
+  EXPECT_TRUE(containsCommand(fusionCmd2, ToEnqueue));
+  EXPECT_TRUE(containsCommand(fusionCmd3, ToEnqueue));
+  EXPECT_TRUE(containsCommand(fusionCmd4, ToEnqueue));
+
+  // Each of the commands submitted while in fusion mode should have exactly
+  // one dependency on the command not participating in fusion, but accessing
+  // the same memory object.
+  EXPECT_TRUE(dependsOnViaDep(fusionCmd1, nonFusionCmd1));
+  EXPECT_EQ(fusionCmd1->MDeps.size(), 1u);
+  EXPECT_TRUE(dependsOnViaDep(fusionCmd2, nonFusionCmd2));
+  EXPECT_EQ(fusionCmd2->MDeps.size(), 1u);
+  EXPECT_TRUE(dependsOnViaDep(fusionCmd3, nonFusionCmd3));
+  EXPECT_EQ(fusionCmd3->MDeps.size(), 1u);
+  EXPECT_TRUE(dependsOnViaDep(fusionCmd4, nonFusionCmd4));
+  EXPECT_EQ(fusionCmd4->MDeps.size(), 1u);
+
+  // There should be one placeholder command in the command list.
+  auto FusionCmdIt = std::find_if(
+      ToEnqueue.begin(), ToEnqueue.end(), [](detail::Command *Cmd) {
+        return Cmd->getType() == sycl::_V1::detail::Command::FUSION;
+      });
+  EXPECT_NE(FusionCmdIt, ToEnqueue.end());
+
+  // Check that the placeholder command has an event dependency on each of the
+  // commands submitted while in fusion mode.
+  auto *placeHolderCmd =
+      static_cast<detail::KernelFusionCommand *>(*FusionCmdIt);
+  EXPECT_EQ(placeHolderCmd->getPreparedDepsEvents().size(), 4u);
+  EXPECT_TRUE(dependsOnViaEvent(placeHolderCmd, fusionCmd2));
+  EXPECT_TRUE(dependsOnViaEvent(placeHolderCmd, fusionCmd3));
+  EXPECT_TRUE(dependsOnViaEvent(placeHolderCmd, fusionCmd4));
+  EXPECT_TRUE(dependsOnViaEvent(placeHolderCmd, fusionCmd1));
+}
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index cb40215fbb59e..cfa2a4d98cbd3 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -7,10 +7,10 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 
+#include
 #include
 #include
 #include
-#include
 #include
 #include
@@ -143,9 +143,11 @@ class MockScheduler : public sycl::detail::Scheduler {
   static bool enqueueCommand(sycl::detail::Command *Cmd,
                              sycl::detail::EnqueueResultT &EnqueueResult,
                              sycl::detail::BlockingT Blocking) {
+    RWLockT MockLock;
+    ReadLockT MockReadLock(MockLock);
     std::vector<sycl::detail::Command *> ToCleanUp;
-    return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, ToCleanUp, Cmd,
-                                          Blocking);
+    return GraphProcessor::enqueueCommand(Cmd, MockReadLock, EnqueueResult,
+                                          ToCleanUp, Cmd, Blocking);
   }
 
   sycl::detail::AllocaCommandBase *
@@ -197,7 +199,13 @@ class MockScheduler : public sycl::detail::Scheduler {
   addCG(std::unique_ptr<sycl::detail::CG> CommandGroup,
         sycl::detail::QueueImplPtr Queue,
         std::vector<sycl::detail::Command *> &ToEnqueue) {
-    return MGraphBuilder.addCG(std::move(CommandGroup), Queue, ToEnqueue);
+    return MGraphBuilder.addCG(std::move(CommandGroup), Queue, ToEnqueue)
+        .NewCmd;
+  }
+
+  void cancelFusion(sycl::detail::QueueImplPtr Queue,
+                    std::vector<sycl::detail::Command *> &ToEnqueue) {
+    MGraphBuilder.cancelFusion(Queue, ToEnqueue);
   }
 };
 
From eb33bbcfbeab7af1a7f58fb4dc6b53bc47f73dba Mon Sep 17 00:00:00 2001
From: Pavel Chupin
Date: Fri, 16 Dec 2022 10:11:44 -0800
Subject: [PATCH 08/11] [SYCL][Doc] Add info on PR merge process (#7792)

* Add what to do with unexpected failures
* Add some cross-doc links
---
 CONTRIBUTING.md                         | 12 +++++++++++-
 README.md                               |  2 ++
 sycl/doc/developer/ContributeToDPCPP.md |  4 ++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 787b5e3cf308c..fe6c6d8e953aa 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -81,6 +81,15 @@ approved by an Intel representative.
   - A new approval is needed if the PR was updated (e.g. during code review).
 - Once the PR is approved and all checks have passed, the pull request is
   ready for merge.
+- Sometimes unrelated failures can be observed in the PR. It is the author's
+  responsibility to find or guess the reason for these failures and to post a
+  comment in the PR with:
+  - the likely reason for the failures, ideally with a link to the culprit PR
+  - a link to other PR(s) that are expected to fix the failures
+  - the person who is currently looking into the failures
+  - a link to an existing open issue
+  - if the author cannot identify any of these, the minimal expected action
+    is to open a new [issue](/../../issues)
 
 ### Merge
 
@@ -89,7 +98,8 @@ and merge] and using PR description as the commit message, replacing all
 individual comments made per commit. Authors of the change must ensure PR
 description is up to date at the merge stage, as sometimes comments addressed
 during code reviews can invalidate original PR description. Feel free to ping
-@intel/llvm-gatekeepers if your PR is green and can be merged.
+@intel/llvm-gatekeepers if your PR is green and can be merged. Note that
+gatekeepers will require an explanation for any failures observed.
 
 Pulldown from LLVM upstream is done through merge commits to preserve hashes
 of the original commits pulled from the LLVM community repository.
diff --git a/README.md b/README.md
index 68110c280523c..9df66d5bd4e7c 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@ Intel LLVM-based projects:
    - [oneAPI DPC++ compiler](#oneapi-dpc-compiler)
    - [Late-outline OpenMP and OpenMP Offload](#late-outline-openmp-and-openmp-offload)
 
+For general contribution process see [CONTRIBUTING.md](./CONTRIBUTING.md)
+
 ## oneAPI DPC++ compiler
 
 [![](https://spec.oneapi.io/oneapi-logo-white-scaled.jpg)](https://www.oneapi.io/)
diff --git a/sycl/doc/developer/ContributeToDPCPP.md b/sycl/doc/developer/ContributeToDPCPP.md
index 4e8bb5bd1309a..74d3ae00c0c3d 100644
--- a/sycl/doc/developer/ContributeToDPCPP.md
+++ b/sycl/doc/developer/ContributeToDPCPP.md
@@ -1,5 +1,9 @@
 # Contributing to DPC++
 
+## General guidelines
+
+Read [CONTRIBUTING.md](/CONTRIBUTING.md) first.
+
 ## Maintaining stable ABI/API
 
 All changes made to the DPC++ compiler and runtime library should generally

From 236a09d7f6c3b9178c896fb855a3169d9c40b40a Mon Sep 17 00:00:00 2001
From: Zahira Ammarguellat
Date: Mon, 19 Dec 2022 03:35:47 -0500
Subject: [PATCH 09/11] [SYCL] Fix build issue that appeared in self-build
 using clang-15 (#7811)

---
 sycl/include/sycl/detail/defines.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/detail/defines.hpp b/sycl/include/sycl/detail/defines.hpp
index 23df19ce4b284..96fef89592185 100644
--- a/sycl/include/sycl/detail/defines.hpp
+++ b/sycl/include/sycl/detail/defines.hpp
@@ -21,7 +21,7 @@
 #endif
 #endif
 
-#if __has_attribute(sycl_special_class)
+#if __has_attribute(sycl_special_class) && (defined __SYCL_DEVICE_ONLY__)
 #define __SYCL_SPECIAL_CLASS __attribute__((sycl_special_class))
 #else
 #define __SYCL_SPECIAL_CLASS
From abdc0fe3ed9e5a834d5c248149cea91dbbe4bd24 Mon Sep 17 00:00:00 2001
From: Nikita
Date: Mon, 19 Dec 2022 10:26:04 +0100
Subject: [PATCH 10/11] [SYCL] Fix handler::require() to accept a
 non-placeholder arg (#7786)

---
 sycl/include/sycl/handler.hpp         | 12 ++++++------
 sycl/unittests/handler/CMakeLists.txt |  1 +
 sycl/unittests/handler/require.cpp    | 19 +++++++++++++++++++
 3 files changed, 26 insertions(+), 6 deletions(-)
 create mode 100644 sycl/unittests/handler/require.cpp

diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index fc4a28e5d6d95..4aaac38846450 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -1469,18 +1469,18 @@ class __SYCL_EXPORT handler {
   use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);
 
   /// Requires access to the memory object associated with the placeholder
-  /// accessor.
+  /// accessor. Calling this function with a non-placeholder accessor has no
+  /// effect.
   ///
   /// The command group has a requirement to gain access to the given memory
   /// object before executing.
   ///
   /// \param Acc is a SYCL accessor describing required memory region.
   template <typename DataT, int Dims, access::mode AccMode,
-            access::target AccTarget>
-  void
-  require(accessor<DataT, Dims, AccMode, AccTarget,
-                   access::placeholder::true_t>
-              Acc) {
-    associateWithHandler(&Acc, AccTarget);
+            access::target AccTarget, access::placeholder isPlaceholder>
+  void require(accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder> Acc) {
+    if (Acc.is_placeholder())
+      associateWithHandler(&Acc, AccTarget);
   }
 
   /// Registers event dependencies on this command group.
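The behavioral difference is easiest to see in user code. A small
self-contained sketch (hypothetical example, not part of this patch; kernel
names and the chosen access mode are arbitrary):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  sycl::buffer<int, 1> buf{sycl::range<1>{1}};

  // Placeholder accessor: require() registers the requirement with the
  // command group, as before.
  sycl::accessor<int, 1, sycl::access::mode::read_write,
                 sycl::access::target::device,
                 sycl::access::placeholder::true_t>
      placeholder{buf};
  q.submit([&](sycl::handler &h) {
    h.require(placeholder);
    h.single_task<class UsePlaceholder>([=] { placeholder[0] = 1; });
  });

  // Non-placeholder accessor: with this change, require() is accepted and
  // simply does nothing, instead of failing to compile.
  q.submit([&](sycl::handler &h) {
    auto acc = buf.get_access<sycl::access::mode::read_write>(h);
    h.require(acc);
    h.single_task<class UseBoundAccessor>([=] { acc[0] = 2; });
  });
  q.wait();
}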
diff --git a/sycl/unittests/handler/CMakeLists.txt b/sycl/unittests/handler/CMakeLists.txt
index 8e86ce6252a86..eb7fc559ab73c 100644
--- a/sycl/unittests/handler/CMakeLists.txt
+++ b/sycl/unittests/handler/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_sycl_unittest(HandlerTests OBJECT
   SetArgForLocalAccessor.cpp
+  require.cpp
 )
diff --git a/sycl/unittests/handler/require.cpp b/sycl/unittests/handler/require.cpp
new file mode 100644
index 0000000000000..ad952b5026f4c
--- /dev/null
+++ b/sycl/unittests/handler/require.cpp
@@ -0,0 +1,19 @@
+#include <helpers/PiMock.hpp>
+#include <sycl/sycl.hpp>
+
+#include <gtest/gtest.h>
+
+TEST(Require, RequireWithNonPlaceholderAccessor) {
+  sycl::unittest::PiMock Mock;
+  sycl::queue Q;
+  int data = 5;
+  {
+    sycl::buffer<int, 1> buf(&data, 1);
+    Q.submit([&](sycl::handler &h) {
+      auto acc = buf.get_access<sycl::access::mode::read_write>(h);
+      // It should be compilable and does nothing according to the spec
+      h.require(acc);
+    });
+    Q.wait();
+  }
+}
From 0b42d32532cdacf70a6669cfdf4345e18eb1edbb Mon Sep 17 00:00:00 2001
From: Lukas Sommer
Date: Wed, 7 Dec 2022 15:08:00 +0000
Subject: [PATCH 11/11] [SYCL][Fusion] Interface with kernel fusion JIT

Signed-off-by: Lukas Sommer
---
 buildbot/configure.py                         |  11 +-
 sycl/CMakeLists.txt                           |   2 +-
 sycl/cmake/modules/AddSYCLUnitTest.cmake      |   5 +
 sycl/doc/GetStartedGuide.md                   |   9 +
 sycl/include/sycl/detail/cg.hpp               |   4 +
 sycl/source/CMakeLists.txt                    |   9 +
 sycl/source/detail/jit_compiler.cpp           | 881 ++++++++++++++++++
 sycl/source/detail/jit_compiler.hpp           |  62 ++
 sycl/source/detail/jit_device_binaries.cpp    | 138 +++
 sycl/source/detail/jit_device_binaries.hpp    | 154 +++
 .../source/detail/scheduler/graph_builder.cpp |   7 +-
 sycl/test/CMakeLists.txt                      |   4 +
 12 files changed, 1283 insertions(+), 3 deletions(-)
 create mode 100644 sycl/source/detail/jit_compiler.cpp
 create mode 100644 sycl/source/detail/jit_compiler.hpp
 create mode 100644 sycl/source/detail/jit_device_binaries.cpp
 create mode 100644 sycl/source/detail/jit_device_binaries.hpp

diff --git a/buildbot/configure.py b/buildbot/configure.py
index f3a43857b7e1a..5260a8c563361 100644
--- a/buildbot/configure.py
+++ b/buildbot/configure.py
@@ -23,6 +23,11 @@ def do_configure(args):
     libclc_amd_target_names = ';amdgcn--;amdgcn--amdhsa'
     libclc_nvidia_target_names = ';nvptx64--;nvptx64--nvidiacl'
 
+    sycl_enable_fusion = "OFF"
+    if not args.disable_fusion:
+        llvm_external_projects += ";sycl-fusion"
+        sycl_enable_fusion = "ON"
+
     if args.llvm_external_projects:
         llvm_external_projects += ";" + args.llvm_external_projects.replace(",", ";")
 
@@ -32,6 +37,7 @@ def do_configure(args):
     xpti_dir = os.path.join(abs_src_dir, "xpti")
     xptifw_dir = os.path.join(abs_src_dir, "xptifw")
     libdevice_dir = os.path.join(abs_src_dir, "libdevice")
+    fusion_dir = os.path.join(abs_src_dir, "sycl-fusion")
     llvm_targets_to_build = args.host_target
     llvm_enable_projects = 'clang;' + llvm_external_projects
     libclc_targets_to_build = ''
@@ -144,6 +150,7 @@ def do_configure(args):
        "-DXPTI_SOURCE_DIR={}".format(xpti_dir),
        "-DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR={}".format(xptifw_dir),
        "-DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR={}".format(libdevice_dir),
+       "-DLLVM_EXTERNAL_SYCL_FUSION_SOURCE_DIR={}".format(fusion_dir),
        "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects),
        "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build),
        "-DLIBCLC_GENERATE_REMANGLED_VARIANTS={}".format(libclc_gen_remangled_variants),
        "-DLLVM_ENABLE_LLD={}".format(llvm_enable_lld),
        "-DXPTI_ENABLE_WERROR={}".format(xpti_enable_werror),
        "-DSYCL_CLANG_EXTRA_FLAGS={}".format(sycl_clang_extra_flags),
-
"-DSYCL_ENABLE_PLUGINS={}".format(';'.join(set(sycl_enabled_plugins))) + "-DSYCL_ENABLE_PLUGINS={}".format(';'.join(set(sycl_enabled_plugins))), + "-DSYCL_ENABLE_KERNEL_FUSION={}".format(sycl_enable_fusion) ] if args.l0_headers and args.l0_loader: @@ -238,6 +246,7 @@ def main(): parser.add_argument("--llvm-external-projects", help="Add external projects to build. Add as comma seperated list.") parser.add_argument("--ci-defaults", action="store_true", help="Enable default CI parameters") parser.add_argument("--enable-plugin", action='append', help="Enable SYCL plugin") + parser.add_argument("--disable-fusion", action="store_true", help="Disable the kernel fusion JIT compiler") args = parser.parse_args() print("args:{}".format(args)) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index c6ba039f71307..fa6769f5ed5ce 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -145,7 +145,7 @@ install(DIRECTORY ${OpenCL_INCLUDE_DIR}/CL COMPONENT OpenCL-Headers) # Option to enable online kernel fusion via a JIT compiler -option(SYCL_ENABLE_KERNEL_FUSION "Enable kernel fusion via JIT compiler" OFF) +option(SYCL_ENABLE_KERNEL_FUSION "Enable kernel fusion via JIT compiler" ON) # Needed for feature_test.hpp if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index 9f5e6dc30a48c..ea8135be0f09e 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -56,6 +56,11 @@ macro(add_sycl_unittest test_dirname link_variant) OpenCL-Headers ${SYCL_LINK_LIBS} ) + + if(SYCL_ENABLE_KERNEL_FUSION) + target_link_libraries(${test_dirname} PRIVATE sycl-fusion) + endif(SYCL_ENABLE_KERNEL_FUSION) + target_include_directories(${test_dirname} PRIVATE SYSTEM ${sycl_inc_dir} diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index e48e160458459..d0acb3e31d0e2 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -12,6 +12,7 @@ and a wide range of compute accelerators such as GPU and FPGA. - [Build DPC++ toolchain with support for HIP AMD](#build-dpc-toolchain-with-support-for-hip-amd) - [Build DPC++ toolchain with support for HIP NVIDIA](#build-dpc-toolchain-with-support-for-hip-nvidia) - [Build DPC++ toolchain with support for ESIMD CPU Emulation](#build-dpc-toolchain-with-support-for-esimd-emulator) + - [Build DPC++ toolchain with support for runtime kernel fusion](#build-dpc-toolchain-with-support-for-runtime-kernel-fusion) - [Build Doxygen documentation](#build-doxygen-documentation) - [Deployment](#deployment) - [Use DPC++ toolchain](#use-dpc-toolchain) @@ -298,6 +299,14 @@ Enabling this flag requires following packages installed. Currently, this feature was tested and verified on Ubuntu 20.04 environment. +### Build DPC++ toolchain with support for runtime kernel fusion + +Support for the experimental SYCL extension for user-driven kernel fusion +at runtime is enabled by default. + +To disable support for this feature, follow the instructions for the +Linux DPC++ toolchain, but add the `--disable-fusion` flag. + ### Build Doxygen documentation Building Doxygen documentation is similar to building the product itself. 
First,
diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp
index 6751dcb542060..b835b8f24968a 100644
--- a/sycl/include/sycl/detail/cg.hpp
+++ b/sycl/include/sycl/detail/cg.hpp
@@ -98,6 +98,10 @@ class CG {
 
   CGTYPE getType() { return MType; }
 
+  std::vector<std::vector<char>> &getArgsStorage() { return MArgsStorage; }
+
+  std::vector<detail::AccessorImplPtr> &getAccStorage() { return MAccStorage; }
+
   virtual ~CG() = default;
 
 private:
diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt
index ecd54a4ced599..9fabf9bc9270c 100644
--- a/sycl/source/CMakeLists.txt
+++ b/sycl/source/CMakeLists.txt
@@ -87,6 +87,13 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME)
     PRIVATE OpenCL-Headers
   )
 
+  if(SYCL_ENABLE_KERNEL_FUSION)
+    target_link_libraries(${LIB_NAME} PRIVATE sycl-fusion)
+    target_link_libraries(${LIB_OBJ_NAME} PRIVATE sycl-fusion)
+    set_property(GLOBAL APPEND PROPERTY SYCL_TOOLCHAIN_INSTALL_COMPONENTS
+                 sycl-fusion)
+  endif(SYCL_ENABLE_KERNEL_FUSION)
+
   find_package(Threads REQUIRED)
 
   target_link_libraries(${LIB_NAME}
@@ -139,6 +146,8 @@ set(SYCL_SOURCES
   "detail/handler_proxy.cpp"
   "detail/image_accessor_util.cpp"
   "detail/image_impl.cpp"
+  "detail/jit_compiler.cpp"
+  "detail/jit_device_binaries.cpp"
   "detail/kernel_impl.cpp"
   "detail/kernel_program_cache.cpp"
   "detail/memory_manager.cpp"
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
new file mode 100644
index 0000000000000..970a465b291d6
--- /dev/null
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -0,0 +1,881 @@
+//==--- jit_compiler.cpp - SYCL runtime JIT compiler for kernel fusion -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include
+#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+jit_compiler::jit_compiler() : MJITContext{new ::jit_compiler::JITContext{}} {}
+
+jit_compiler::~jit_compiler() = default;
+
+static ::jit_compiler::BinaryFormat
+translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
+  switch (Type) {
+  case PI_DEVICE_BINARY_TYPE_SPIRV:
+    return ::jit_compiler::BinaryFormat::SPIRV;
+  case PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE:
+    return ::jit_compiler::BinaryFormat::LLVM;
+  default:
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "Format unsupported for JIT compiler");
+  }
+}
+
+static ::jit_compiler::ParameterKind
+translateArgType(kernel_param_kind_t Kind) {
+  using PK = ::jit_compiler::ParameterKind;
+  using kind = kernel_param_kind_t;
+  switch (Kind) {
+  case kind::kind_accessor:
+    return PK::Accessor;
+  case kind::kind_std_layout:
+    return PK::StdLayout;
+  case kind::kind_sampler:
+    return PK::Sampler;
+  case kind::kind_pointer:
+    return PK::Pointer;
+  case kind::kind_specialization_constants_buffer:
+    return PK::SpecConstBuffer;
+  case kind::kind_stream:
+    return PK::Stream;
+  case kind::kind_invalid:
+    return PK::Invalid;
+  }
+  return PK::Invalid;
+}
+
+enum class Promotion { None, Private, Local };
+
+struct PromotionInformation {
+  Promotion PromotionTarget;
+  unsigned KernelIndex;
+  unsigned ArgIndex;
+  Requirement *Definition;
+  NDRDescT NDRange;
+  size_t LocalSize;
+  std::vector<bool> UsedParams;
+};
+
+using PromotionMap = std::unordered_map<SYCLMemObjI *, PromotionInformation>;
+
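+// Worked example for getLocalSize() below, with illustrative numbers: for a
+// buffer of 1024 elements and an ND-range with global size 256,
+//   - private promotion yields 1024 / 256 work-items = 4 elements per
+//     work-item;
+//   - local promotion with work-group size 64 yields 256 / 64 = 4 work-groups
+//     and thus 1024 / 4 = 256 elements per work-group.
+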
+static inline void printPerformanceWarning(const std::string &Message) {
+  if (detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0) {
+    std::cerr << "WARNING: " << Message << "\n";
+  }
+}
+
+template <typename Obj> Promotion getPromotionTarget(Obj &&obj) {
+  auto Result = Promotion::None;
+  if (obj.template has_property<
+          ext::codeplay::experimental::property::promote_private>()) {
+    Result = Promotion::Private;
+  }
+  if (obj.template has_property<
+          ext::codeplay::experimental::property::promote_local>()) {
+    if (Result != Promotion::None) {
+      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                            "Two contradicting promotion properties on the "
+                            "same buffer/accessor are not allowed.");
+    }
+    Result = Promotion::Local;
+  }
+  return Result;
+}
+
+static Promotion getInternalizationInfo(Requirement *Req) {
+  auto AccPromotion = getPromotionTarget(Req->MPropertyList);
+
+  auto *MemObj = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj);
+  if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) {
+    // We currently do not support promotion on non-buffer memory objects
+    // (e.g., images).
+    return Promotion::None;
+  }
+  Promotion BuffPromotion = getPromotionTarget(*MemObj);
+  if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None &&
+      AccPromotion != BuffPromotion) {
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "Contradicting promotion properties on accessor and "
+                          "underlying buffer are not allowed");
+  }
+  return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
+}
+
+static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
+                                          Promotion Target) {
+  auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
+  if (Target == Promotion::Private) {
+    auto NumWorkItems = NDRange.GlobalSize.size();
+    // For private internalization, the local size is
+    // (Number of elements in buffer)/(number of work-items)
+    return NumElementsMem / NumWorkItems;
+  } else if (Target == Promotion::Local) {
+    if (NDRange.LocalSize.size() == 0) {
+      // No work-group size provided, cannot calculate the local size
+      // and need to bail out.
+      return {};
+    }
+    auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size();
+    // For local internalization, the local size is
+    // (Number of elements in buffer)/(number of work-groups)
+    return NumElementsMem / NumWorkGroups;
+  }
+  return 0;
+}
+
+static bool accessorEquals(Requirement *Req, Requirement *Other) {
+  return Req->MOffset == Other->MOffset &&
+         Req->MAccessRange == Other->MAccessRange &&
+         Req->MMemoryRange == Other->MMemoryRange &&
+         Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims &&
+         Req->MElemSize == Other->MElemSize &&
+         Req->MOffsetInBytes == Other->MOffsetInBytes &&
+         Req->MIsSubBuffer == Other->MIsSubBuffer;
+}
+
+static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
+                                   unsigned ArgFunctionIndex, NDRDescT NDRange,
+                                   PromotionMap &Promotions) {
+  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
+
+  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
+
+  auto ThisPromotionTarget = getInternalizationInfo(Req);
+  auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
+
+  if (Promotions.count(Req->MSYCLMemObj)) {
+    // We previously encountered an accessor for the same buffer.
+ auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj); + + switch (ThisPromotionTarget) { + case Promotion::None: { + if (PreviousDefinition.PromotionTarget != Promotion::None) { + printPerformanceWarning( + "Deactivating previously specified promotion, because this " + "accessor does not specify promotion"); + PreviousDefinition.PromotionTarget = Promotion::None; + } + return; + } + case Promotion::Local: { + if (PreviousDefinition.PromotionTarget == Promotion::None) { + printPerformanceWarning( + "Not performing specified local promotion, due to previous " + "mismatch or because previous accessor specified no promotion"); + return; + } + if (!ThisLocalSize.has_value()) { + printPerformanceWarning("Work-group size for local promotion not " + "specified, not performing internalization"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + if (PreviousDefinition.PromotionTarget == Promotion::Private) { + printPerformanceWarning( + "Overriding previous private promotion with local promotion"); + // Recompute the local size for the previous definition with adapted + // promotion target. + auto NewPrevLocalSize = + getLocalSize(PreviousDefinition.NDRange, + PreviousDefinition.Definition, Promotion::Local); + + if (!NewPrevLocalSize.has_value()) { + printPerformanceWarning( + "Not performing specified local promotion because previous " + "kernels did not specify a local size"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + PreviousDefinition.LocalSize = NewPrevLocalSize.value(); + PreviousDefinition.PromotionTarget = Promotion::Local; + } + if (PreviousDefinition.LocalSize != ThisLocalSize.value()) { + printPerformanceWarning("Not performing specified local promotion due " + "to work-group size mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + if (!accessorEquals(Req, PreviousDefinition.Definition)) { + printPerformanceWarning("Not performing specified promotion, due to " + "accessor parameter mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + return; + } + case Promotion::Private: { + if (PreviousDefinition.PromotionTarget == Promotion::None) { + printPerformanceWarning( + "Not performing specified private promotion, due to previous " + "mismatch or because previous accessor specified no promotion"); + return; + } + + if (PreviousDefinition.PromotionTarget == Promotion::Local) { + // Recompute the local size with adapted promotion target. + auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local); + if (!ThisLocalSize.has_value()) { + printPerformanceWarning("Work-group size for local promotion not " + "specified, not performing internalization"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + if (PreviousDefinition.LocalSize != ThisLocalSize.value()) { + printPerformanceWarning( + "Not performing specified local promotion due " + "to work-group size mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + if (!accessorEquals(Req, PreviousDefinition.Definition)) { + printPerformanceWarning("Not performing local promotion, due to " + "accessor parameter mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + printPerformanceWarning( + "Performing local internalization instead, because previous " + "accessor specified local promotion"); + return; + } + + // Previous accessors also specified private promotion. 
+      if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
+        printPerformanceWarning(
+            "Not performing specified private promotion due "
+            "to work-group size mismatch");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+        return;
+      }
+      if (!accessorEquals(Req, PreviousDefinition.Definition)) {
+        printPerformanceWarning("Not performing specified promotion, due to "
+                                "accessor parameter mismatch");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+        return;
+      }
+      return;
+    }
+    }
+  } else {
+    if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) {
+      printPerformanceWarning("Work-group size for local promotion not "
+                              "specified, not performing internalization");
+      ThisPromotionTarget = Promotion::None;
+      ThisLocalSize = 0;
+    }
+    assert(ThisLocalSize.has_value());
+    Promotions.emplace(Req->MSYCLMemObj,
+                       PromotionInformation{ThisPromotionTarget, KernelIndex,
+                                            ArgFunctionIndex, Req, NDRange,
+                                            ThisLocalSize.value(),
+                                            std::vector<bool>()});
+  }
+}
+
+// Identify a parameter by the argument description, the kernel index and the
+// parameter index in that kernel.
+struct Param {
+  ArgDesc Arg;
+  unsigned KernelIndex;
+  unsigned ArgIndex;
+  bool Used;
+  Param(ArgDesc Argument, unsigned KernelIdx, unsigned ArgIdx, bool InUse)
+      : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {}
+};
+
+using ParamList = std::vector<Param>;
+
+using ParamIterator = std::vector<Param>::iterator;
+
+std::vector<Param>::const_iterator
+detectIdenticalParameter(std::vector<Param> &Params, ArgDesc Arg) {
+  for (auto I = Params.begin(); I < Params.end(); ++I) {
+    // Two arguments of different type can never be identical.
+    if (I->Arg.MType == Arg.MType) {
+      if (Arg.MType == kernel_param_kind_t::kind_pointer ||
+          Arg.MType == kernel_param_kind_t::kind_std_layout) {
+        // Compare size and, if the size is identical, the content byte-by-byte.
+        if ((Arg.MSize == I->Arg.MSize) &&
+            std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) {
+          return I;
+        }
+      } else if (Arg.MType == kernel_param_kind_t::kind_accessor) {
+        Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
+        Requirement *Other = static_cast<Requirement *>(I->Arg.MPtr);
+        if (accessorEquals(Req, Other)) {
+          return I;
+        }
+      }
+    }
+  }
+  return Params.end();
+}
+
+template <typename T,
+          typename F = typename std::remove_reference<T>::type>
+F *storePlainArg(std::vector<std::vector<char>> &ArgStorage, T &&Arg) {
+  ArgStorage.emplace_back(sizeof(T));
+  auto Storage = reinterpret_cast<F *>(ArgStorage.back().data());
+  *Storage = Arg;
+  return Storage;
+}
+
+void *storePlainArgRaw(std::vector<std::vector<char>> &ArgStorage, void *ArgPtr,
+                       size_t ArgSize) {
+  ArgStorage.emplace_back(ArgSize);
+  void *Storage = ArgStorage.back().data();
+  std::memcpy(Storage, ArgPtr, ArgSize);
+  return Storage;
+}
+
+static ParamIterator preProcessArguments(
+    std::vector<std::vector<char>> &ArgStorage, ParamIterator Arg,
+    PromotionMap &PromotedAccs,
+    std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams,
+    std::vector<::jit_compiler::JITConstant> &JITConstants,
+    ParamList &NonIdenticalParams,
+    ::jit_compiler::ParamIdentList &ParamIdentities) {
+
+  // Unused arguments are still in the list at this point (because we
+  // need them for accessor handling), but there's no pre-processing
+  // that needs to be done.
+  if (!Arg->Used) {
+    return ++Arg;
+  }
+
+  if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
+    // Pointer arguments are only stored in the kernel functor object, which
+    // will go out-of-scope before we execute the fused kernel. Therefore, we
+    // need to copy the pointer (not the memory it's pointing to) to a
+    // permanent location and update the argument.
+    Arg->Arg.MPtr =
+        storePlainArg(ArgStorage, *static_cast<void **>(Arg->Arg.MPtr));
+  }
+  if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) {
+    // Standard layout arguments are only stored in the kernel functor object,
+    // which will go out-of-scope before we execute the fused kernel.
+    // Therefore, we need to copy the argument to a permanent location and
+    // update the argument.
+    Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+  }
+  // First check if there's already another parameter with identical
+  // value.
+  auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg);
+  if (Identical != NonIdenticalParams.end()) {
+    ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex};
+    ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex,
+                                             Identical->ArgIndex};
+    ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam};
+    ParamIdentities.push_back(Identity);
+    return ++Arg;
+  }
+
+  if (Arg->Arg.MType == kernel_param_kind_t::kind_accessor) {
+    // Get local and private promotion information from accessors.
+    Requirement *Req = static_cast<Requirement *>(Arg->Arg.MPtr);
+    auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj);
+    auto PromotionTarget = Internalization.PromotionTarget;
+    if (PromotionTarget == Promotion::Private ||
+        PromotionTarget == Promotion::Local) {
+      // The accessor should be promoted.
+      if (Internalization.KernelIndex == Arg->KernelIndex &&
+          Internalization.ArgIndex == Arg->ArgIndex) {
+        // This is the first accessor for this buffer that should be
+        // internalized.
+        InternalizeParams.emplace_back(
+            ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
+            (PromotionTarget == Promotion::Private)
+                ? ::jit_compiler::Internalization::Private
+                : ::jit_compiler::Internalization::Local,
+            Internalization.LocalSize);
+        // If an accessor will be promoted, i.e., if it has the promotion
+        // property attached to it, the next three arguments, that are
+        // associated with the accessor (access range, memory range, offset),
+        // must not participate in identical parameter detection or constant
+        // propagation, because their values will change if promotion happens.
+        // Therefore, we can just skip them here, but we need to remember which
+        // of them are used.
+        for (unsigned I = 0; I < 4; ++I) {
+          Internalization.UsedParams.push_back(Arg->Used);
+          ++Arg;
+        }
+      } else {
+        // We have previously encountered an accessor for the same buffer,
+        // which should be internalized. We can add parameter identities for
+        // the accessor argument and the next three arguments (range, memory
+        // range and offset, if they are used).
+        unsigned Increment = 0;
+        for (unsigned I = 0; I < 4; ++I) {
+          // If the argument is used in both cases, i.e., on the original
+          // accessor to be internalized, and this one, we can insert a
+          // parameter identity.
+          if (Arg->Used && Internalization.UsedParams[I]) {
+            ::jit_compiler::Parameter ThisParam{Arg->KernelIndex,
+                                                Arg->ArgIndex};
+            ::jit_compiler::Parameter IdenticalParam{
+                Internalization.KernelIndex,
+                Internalization.ArgIndex + Increment};
+            ::jit_compiler::ParameterIdentity Identity{ThisParam,
+                                                       IdenticalParam};
+            ParamIdentities.push_back(Identity);
+          }
+          if (Internalization.UsedParams[I]) {
+            ++Increment;
+          }
+          ++Arg;
+        }
+      }
+      return Arg;
+    } else {
+      // The accessor will not be promoted, so it can participate in identical
+      // parameter detection.
+      NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex,
+                                      Arg->ArgIndex, true);
+      return ++Arg;
+    }
+  } else if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) {
+    // No identical parameter exists, so add this to the list.
+    NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
+                                    true);
+    // Propagate values of scalar parameters as constants to the JIT
+    // compiler.
+    JITConstants.emplace_back(
+        ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
+        Arg->Arg.MPtr, Arg->Arg.MSize);
+    return ++Arg;
+  } else if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
+    // No identical parameter exists, so add this to the list.
+    NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
+                                    true);
+    return ++Arg;
+  }
+  return ++Arg;
+}
+
+static void
+updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+                   NDRDescT NDRange, std::vector<ArgDesc> &FusedArgs,
+                   std::vector<std::vector<char>> &FusedArgStorage) {
+  auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask;
+  assert(ArgUsageInfo.size() == FusedArgs.size());
+  for (size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) {
+    bool PromotedToPrivate =
+        (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate);
+    bool PromotedToLocal =
+        (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal);
+    if (PromotedToLocal || PromotedToPrivate) {
+      // For each internalized accessor, we need to override four arguments
+      // (see 'addArgsForGlobalAccessor' in handler.cpp for reference), i.e.,
+      // the pointer itself, plus twice the range and the offset.
+      auto &OldArgDesc = FusedArgs[ArgIndex];
+      assert(OldArgDesc.MType == kernel_param_kind_t::kind_accessor);
+      auto *Req = static_cast<Requirement *>(OldArgDesc.MPtr);
+
+      // The stored args are all three-dimensional, but depending on the
+      // actual number of dimensions of the accessor, only a part of that
+      // argument is later on passed to the kernel.
+      const size_t SizeAccField =
+          sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
+      // Compute the local size and use it for the range parameters.
+      auto LocalSize = getLocalSize(NDRange, Req,
+                                    (PromotedToPrivate) ? Promotion::Private
+                                                        : Promotion::Local);
+      range<3> AccessRange{1, 1, LocalSize.value()};
+      auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
+      // Use all-zero as the offset
+      id<3> AccessOffset{0, 0, 0};
+      auto *OffsetArg = storePlainArg(FusedArgStorage, AccessOffset);
+
+      // Override the arguments.
+      // 1. Override the pointer with a std-layout argument with 'nullptr' as
+      // value. handler.cpp does the same for local accessors.
+      int SizeInBytes = Req->MElemSize * LocalSize.value();
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
+                  static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+      // 2. Access Range
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
+                  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+      // 3. Memory Range
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
+                  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+      // 4. Offset
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, OffsetArg,
+                  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+    } else {
+      ++ArgIndex;
+    }
+  }
+}
+
+std::unique_ptr<detail::CG>
+jit_compiler::fuseKernels(QueueImplPtr Queue,
+                          std::vector<Command *> &InputKernels,
+                          const property_list &PropList) {
+  // Retrieve the device binary from each of the input
+  // kernels to hand them over to the JIT compiler.
+  std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo;
+  std::vector<std::string> InputKernelNames;
+  // Collect argument information from all input kernels.
+  std::vector<std::vector<char>> ArgsStorage;
+  std::vector<AccessorImplPtr> AccStorage;
+  std::vector<Requirement *> Requirements;
+  std::vector<EventImplPtr> Events;
+  NDRDescT NDRDesc;
+  unsigned KernelIndex = 0;
+  ParamList FusedParams;
+  PromotionMap PromotedAccs;
+  // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary
+  // resources (which contain reductions) and figure out how to fuse them.
+  for (auto &RawCmd : InputKernels) {
+    auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
+    auto &CG = KernelCmd->getCG();
+    assert(CG.getType() == CG::Kernel);
+    auto *KernelCG = static_cast<CGExecKernel *>(&CG);
+
+    auto KernelName = KernelCG->MKernelName;
+    if (KernelName.empty()) {
+      printPerformanceWarning(
+          "Cannot fuse kernel with invalid kernel function name");
+      return nullptr;
+    }
+    const RTDeviceBinaryImage *DeviceImage = nullptr;
+    RT::PiProgram Program = nullptr;
+    if (KernelCG->getKernelBundle() != nullptr) {
+      // Retrieve the device image from the kernel bundle.
+      auto KernelBundle = KernelCG->getKernelBundle();
+      kernel_id KernelID =
+          detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
+
+      auto SyclKernel = detail::getSyclObjImpl(
+          KernelBundle->get_kernel(KernelID, KernelBundle));
+
+      DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
+      Program = SyclKernel->getDeviceImage()->get_program_ref();
+    } else if (KernelCG->MSyclKernel != nullptr) {
+      DeviceImage =
+          KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
+      Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
+    } else {
+      auto ContextImpl = Queue->getContextImplPtr();
+      auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+      auto DeviceImpl = Queue->getDeviceImplPtr();
+      auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+      DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
+          KernelCG->MOSModuleHandle, KernelName, Context, Device);
+      Program = detail::ProgramManager::getInstance().createPIProgram(
+          *DeviceImage, Context, Device);
+    }
+    if (!DeviceImage || !Program) {
+      printPerformanceWarning("No suitable IR available for fusion");
+      return nullptr;
+    }
+    ProgramManager::KernelArgMask EliminatedArgs;
+    if (Program && (KernelCG->MSyclKernel == nullptr ||
+                    !KernelCG->MSyclKernel->isCreatedFromSource())) {
+      EliminatedArgs =
+          detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
+              KernelCG->MOSModuleHandle, Program, KernelName);
+    }
+
+    // Collect information about the arguments of this kernel.
+
+    // Might need to sort the arguments in case they are not already sorted,
+    // see also the similar code in commands.cpp.
+    auto Args = KernelCG->MArgs;
+    std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
+      return A.MIndex < B.MIndex;
+    });
+
+    ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor;
+    size_t ArgIndex = 0;
+    // The kernel function in SPIR-V will only have the non-eliminated
+    // arguments, so keep track of this "actual" argument index.
+    unsigned ArgFunctionIndex = 0;
+    for (auto &Arg : Args) {
+      ArgDescriptor.Kinds.push_back(translateArgType(Arg.MType));
+      // DPC++ internally uses 'true' to indicate that an argument has been
+      // eliminated, while the JIT compiler uses 'true' to indicate an
+      // argument is used. Translate this here.
+      bool Eliminated = !EliminatedArgs.empty() && EliminatedArgs[ArgIndex++];
+      ArgDescriptor.UsageMask.emplace_back(!Eliminated);
+
+      // If the argument has not been eliminated, i.e., is still present on
+      // the kernel function in LLVM-IR/SPIR-V, collect information about the
+      // argument for performance optimizations in the JIT compiler.
+      if (!Eliminated) {
+        if (Arg.MType == kernel_param_kind_t::kind_accessor) {
+          resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
+                                 KernelCG->MNDRDesc, PromotedAccs);
+        }
+        FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
+        ++ArgFunctionIndex;
+      } else {
+        FusedParams.emplace_back(Arg, KernelIndex, 0, false);
+      }
+    }
+
+    // TODO(Lukas, ONNX-399): Check for the correct kernel bundle state of the
+    // device image?
+    auto &RawDeviceImage = DeviceImage->getRawData();
+    auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
+                                               RawDeviceImage.BinaryStart);
+    // Set 0 as the number of address bits, because the JIT compiler can set
+    // this field based on information from SPIR-V/LLVM module's data-layout.
+    auto BinaryImageFormat =
+        translateBinaryImageFormat(DeviceImage->getFormat());
+    if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
+      printPerformanceWarning("No suitable IR available for fusion");
+      return nullptr;
+    }
+    ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
+        translateBinaryImageFormat(DeviceImage->getFormat()), 0,
+        RawDeviceImage.BinaryStart, DeviceImageSize};
+
+    InputKernelInfo.emplace_back(KernelName, ArgDescriptor, BinInfo);
+    InputKernelNames.push_back(KernelName);
+
+    // Collect information for the fused kernel
+
+    // TODO(Lukas, ONNX-399): Currently assuming the NDRDesc is identical for
+    // all input kernels. Actually verify this here or in the graph_builder.
+    auto &CurrentNDR = KernelCG->MNDRDesc;
+    if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) {
+      // Some overloads of parallel_for_work_group only specify the number of
+      // work-groups, so this can be used to identify hierarchical parallel
+      // kernels, which are not supported by fusion for now.
+      printPerformanceWarning(
+          "Cannot fuse kernel with hierarchical parallelism");
+      return nullptr;
+      // Not all overloads of parallel_for_work_group only specify the number
+      // of work-groups, so the above mechanism might not detect all
+      // hierarchical parallelism.
+      // TODO(Lukas, CRD-6): Find a more reliable way to detect hierarchical
+      // parallelism.
+    }
+    if (KernelIndex == 0) {
+      NDRDesc = CurrentNDR;
+    } else {
+      if (CurrentNDR.Dims != NDRDesc.Dims) {
+        printPerformanceWarning(
+            "Cannot fuse kernels with different dimensionality");
+        return nullptr;
+      }
+      if (CurrentNDR.GlobalOffset != NDRDesc.GlobalOffset) {
+        printPerformanceWarning(
+            "Cannot fuse kernels with different global offset");
+        return nullptr;
+      }
+      if (CurrentNDR.GlobalSize != NDRDesc.GlobalSize) {
+        printPerformanceWarning(
+            "Cannot fuse kernels with different global size");
+        return nullptr;
+      }
+      if (CurrentNDR.LocalSize[0] != 0 &&
+          CurrentNDR.LocalSize != NDRDesc.LocalSize) {
+        printPerformanceWarning(
+            "Cannot fuse kernels with different local size");
+        return nullptr;
+      }
+    }
+    // We need to copy the storages here. The input CGs might be eliminated
+    // before the fused kernel gets executed, so we need to copy the storages
+    // here to make sure the arguments don't die on us before executing the
+    // fused kernel.
+    ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(),
+                       KernelCG->getArgsStorage().end());
+    AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(),
+                      KernelCG->getAccStorage().end());
+    // TODO(Lukas, ONNX-399): Does the MSharedPtrStorage contain any
+    // information about actual shared pointers beside the kernel bundle and
+    // handler impl? If yes, we might need to copy it here.
+    Requirements.insert(Requirements.end(), KernelCG->MRequirements.begin(),
+                        KernelCG->MRequirements.end());
+    Events.insert(Events.end(), KernelCG->MEvents.begin(),
+                  KernelCG->MEvents.end());
+    ++KernelIndex;
+  }
+
+  // Pre-process the arguments, to detect identical parameters or arguments
+  // that can be constant-propagated by the JIT compiler.
+  std::vector<::jit_compiler::ParameterInternalization> InternalizeParams;
+  std::vector<::jit_compiler::JITConstant> JITConstants;
+  ::jit_compiler::ParamIdentList ParamIdentities;
+  ParamList NonIdenticalParameters;
+  for (auto PI = FusedParams.begin(); PI != FusedParams.end();) {
+    PI = preProcessArguments(ArgsStorage, PI, PromotedAccs, InternalizeParams,
+                             JITConstants, NonIdenticalParameters,
+                             ParamIdentities);
+  }
+
+  // Retrieve barrier flags.
+  int BarrierFlags =
+      (PropList
+           .has_property<ext::codeplay::experimental::property::no_barriers>())
+          ? -1
+          : 3;
+
+  static size_t FusedKernelNameIndex = 0;
+  std::stringstream FusedKernelName;
+  FusedKernelName << "fused_" << FusedKernelNameIndex++;
+  ::jit_compiler::Config JITConfig;
+  bool DebugEnabled =
+      detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0;
+  JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
+  // TODO: Enable caching in a separate PR.
+
+  auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
+      *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,
+      FusedKernelName.str(), ParamIdentities, BarrierFlags, InternalizeParams,
+      JITConstants);
+
+  if (FusionResult.failed()) {
+    if (DebugEnabled) {
+      std::cerr
+          << "ERROR: JIT compilation for kernel fusion failed with message:\n"
+          << FusionResult.getErrorMessage() << "\n";
+    }
+    return nullptr;
+  }
+
+  auto &FusedKernelInfo = FusionResult.getKernelInfo();
+
+  std::vector<ArgDesc> FusedArgs;
+  int FusedArgIndex = 0;
+  for (auto &Param : FusedParams) {
+    // Add to the argument list of the fused kernel, but with the correct
+    // new index in the fused kernel.
+    auto &Arg = Param.Arg;
+    FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++);
+  }
+
+  // Update the kernel arguments for internalized accessors.
+  updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
+
+  if (!FusionResult.cached()) {
+    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo);
+    detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
+  } else if (DebugEnabled) {
+    std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+  }
+
+  // Create a kernel bundle for the fused kernel.
+  // Kernel bundles are stored in the CG as one of the "extended" members.
+  auto FusedKernelId = detail::ProgramManager::getInstance().getSYCLKernelID(
+      FusedKernelInfo.Name);
+  std::vector<std::shared_ptr<const void>> RawExtendedMembers;
+
+  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
+      detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
+          Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+
+  std::unique_ptr<detail::CG> FusedCG;
+  FusedCG.reset(new detail::CGExecKernel(
+      NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
+      std::move(ArgsStorage), std::move(AccStorage),
+      std::move(RawExtendedMembers), std::move(Requirements), std::move(Events),
+      std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {},
+      {}, CG::CGTYPE::Kernel));
+  return FusedCG;
+}
+
+pi_device_binaries jit_compiler::createPIDeviceBinary(
+    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) {
+
+  DeviceBinaryContainer Binary;
+
+  // Create an offload entry for the fused kernel.
+  // It seems to be OK to set zero for most of the information here, at least
+  // that is the case for compiled SPIR-V binaries.
+  OffloadEntryContainer Entry{FusedKernelInfo.Name, nullptr, 0, 0, 0};
+  Binary.addOffloadEntry(std::move(Entry));
+
+  // Create a property entry for the argument usage mask for the fused kernel.
+  auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask);
+  PropertyContainer ArgMaskProp{FusedKernelInfo.Name, ArgMask.data(),
+                                ArgMask.size(),
+                                pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY};
+
+  // Create a property set for the argument usage masks of all kernels
+  // (currently only one).
+  PropertySetContainer ArgMaskPropSet{
+      __SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO};
+
+  ArgMaskPropSet.addProperty(std::move(ArgMaskProp));
+
+  Binary.addProperty(std::move(ArgMaskPropSet));
+
+  DeviceBinariesCollection Collection;
+  Collection.addDeviceBinary(std::move(Binary),
+                             FusedKernelInfo.BinaryInfo.BinaryStart,
+                             FusedKernelInfo.BinaryInfo.BinarySize,
+                             FusedKernelInfo.BinaryInfo.AddressBits);
+
+  JITDeviceBinaries.push_back(std::move(Collection));
+  return JITDeviceBinaries.back().getPIDeviceStruct();
+}
+
+std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
+    const ::jit_compiler::ArgUsageMask &Mask) const {
+  // This must match the decoding logic in program_manager.cpp.
+  constexpr uint64_t NBytesForSize = 8;
+  constexpr uint64_t NBitsInElement = 8;
+  uint64_t Size = static_cast<uint64_t>(Mask.size());
+  // Round the size to the next multiple of 8
+  uint64_t RoundedSize =
+      ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1)));
+  std::vector<uint8_t> Encoded((RoundedSize / NBitsInElement) + NBytesForSize,
+                               0u);
+  // First encode the size of the actual mask
+  for (size_t i = 0; i < NBytesForSize; ++i) {
+    uint8_t Byte =
+        static_cast<uint8_t>((RoundedSize >> i * NBitsInElement) & 0xFF);
+    Encoded[i] = Byte;
+  }
+  // Encode the actual mask bit-wise
+  for (size_t i = 0; i < Size; ++i) {
+    // DPC++ internally uses 'true' to indicate that an argument has been
+    // eliminated, while the JIT compiler uses 'true' to indicate an argument
+    // is used. Translate this here.
+    if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) {
+      uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)];
+      Byte |= static_cast<uint8_t>(1 << (i % NBitsInElement));
+    }
+  }
+  return Encoded;
+}
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
+
+#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
new file mode 100644
index 0000000000000..522c0749ef75b
--- /dev/null
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -0,0 +1,62 @@
+//==--- jit_compiler.hpp - SYCL runtime JIT compiler for kernel fusion -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <detail/jit_device_binaries.hpp>
+#include <detail/scheduler/commands.hpp>
+#include <sycl/property_list.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace jit_compiler {
+class JITContext;
+struct SYCLKernelInfo;
+using ArgUsageMask = std::vector<uint8_t>;
+} // namespace jit_compiler
+
+struct pi_device_binaries_struct;
+struct _pi_offload_entry_struct;
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+class jit_compiler {
+
+public:
+  std::unique_ptr<detail::CG>
+  fuseKernels(QueueImplPtr Queue, std::vector<ExecCGCommand *> &InputKernels,
+              const property_list &);
+
+  static jit_compiler &get_instance() {
+    static jit_compiler instance{};
+    return instance;
+  }
+
+private:
+  jit_compiler();
+  ~jit_compiler();
+  jit_compiler(const jit_compiler &) = delete;
+  jit_compiler(jit_compiler &&) = delete;
+  jit_compiler &operator=(const jit_compiler &) = delete;
+  jit_compiler &operator=(jit_compiler &&) = delete;
+
+  pi_device_binaries
+  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo);
+
+  std::vector<uint8_t>
+  encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const;
+
+  // Manages the lifetime of the PI structs for device binaries.
+  std::vector<DeviceBinariesCollection> JITDeviceBinaries;
+
+  std::unique_ptr<::jit_compiler::JITContext> MJITContext;
+};
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
new file mode 100644
index 0000000000000..3023e1ebb14e6
--- /dev/null
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -0,0 +1,138 @@
+//==--- jit_device_binaries.cpp - Construction of PI device binaries ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <detail/jit_device_binaries.hpp>
+
+#include <cassert>
+#include <cstring>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+OffloadEntryContainer::OffloadEntryContainer(const std::string &Name,
+                                             void *Addr, size_t Size,
+                                             int32_t Flags, int32_t Reserved)
+    : KernelName{new char[Name.length() + 1]}, Address{Addr}, EntrySize{Size},
+      EntryFlags{Flags}, EntryReserved{Reserved} {
+  std::memcpy(KernelName.get(), Name.c_str(), Name.length() + 1);
+}
+
+_pi_offload_entry_struct OffloadEntryContainer::getPIOffloadEntry() {
+  return _pi_offload_entry_struct{Address, KernelName.get(), EntrySize,
+                                  EntryFlags, EntryReserved};
+}
+
+PropertyContainer::PropertyContainer(const std::string &Name, void *Data,
+                                     size_t Size, uint32_t Type)
+    : PropName{new char[Name.length() + 1]}, Value{new unsigned char[Size]},
+      ValueSize{Size}, PropType{Type} {
+  std::memcpy(PropName.get(), Name.c_str(), Name.length() + 1);
+  std::memcpy(Value.get(), Data, Size);
+}
+
+_pi_device_binary_property_struct PropertyContainer::getPIProperty() {
+  return _pi_device_binary_property_struct{PropName.get(), Value.get(),
+                                           PropType, ValueSize};
+}
+
+PropertySetContainer::PropertySetContainer(const std::string &Name)
+    : SetName{new char[Name.length() + 1]} {
+  std::memcpy(SetName.get(), Name.c_str(), Name.length() + 1);
+}
+
+void PropertySetContainer::addProperty(PropertyContainer &&Prop) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIPropertySet(). Forbid calls to this method after the first PI
+  // struct has been created.
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIProperties.push_back(Prop.getPIProperty());
+  Properties.push_back(std::move(Prop));
+}
+
+_pi_device_binary_property_set_struct PropertySetContainer::getPIPropertySet() {
+  Fused = false;
+  return _pi_device_binary_property_set_struct{
+      const_cast<char *>(SetName.get()), PIProperties.data(),
+      PIProperties.data() + Properties.size()};
+}
+
+void DeviceBinaryContainer::addOffloadEntry(OffloadEntryContainer &&Cont) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIDeviceBinary(). Forbid calls to this method after the first PI
+  // struct has been created.
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIOffloadEntries.push_back(Cont.getPIOffloadEntry());
+  OffloadEntries.push_back(std::move(Cont));
+}
+
+void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIDeviceBinary(). Forbid calls to this method after the first PI
+  // struct has been created.
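+  // The Fused flag below tracks whether a PI struct has already been handed
+  // out; it is cleared by getPIDeviceBinary().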
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIPropertySets.push_back(Cont.getPIPropertySet());
+  PropertySets.push_back(std::move(Cont));
+}
+
+pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
+    const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) {
+  pi_device_binary_struct DeviceBinary;
+  DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
+  DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
+  DeviceBinary.CompileOptions = "";
+  DeviceBinary.LinkOptions = "";
+  DeviceBinary.ManifestStart = nullptr;
+  DeviceBinary.ManifestEnd = nullptr;
+  // It is safe to use these pointers here, as their lifetime is managed by
+  // the JITContext.
+  DeviceBinary.BinaryStart = BinaryStart;
+  DeviceBinary.BinaryEnd = BinaryStart + BinarySize;
+  DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV;
+  DeviceBinary.DeviceTargetSpec = (AddressBits == 32)
+                                      ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32
+                                      : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64;
+  DeviceBinary.EntriesBegin = PIOffloadEntries.data();
+  DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size();
+  DeviceBinary.PropertySetsBegin = PIPropertySets.data();
+  DeviceBinary.PropertySetsEnd = PIPropertySets.data() + PIPropertySets.size();
+  Fused = false;
+  return DeviceBinary;
+}
+
+void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
+                                               const unsigned char *BinaryStart,
+                                               size_t BinarySize,
+                                               size_t AddressBits) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIDeviceStruct(). Forbid calls to this method after the first PI
+  // struct has been created.
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIBinaries.push_back(
+      Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits));
+  Binaries.push_back(std::move(Cont));
+}
+
+pi_device_binaries DeviceBinariesCollection::getPIDeviceStruct() {
+  PIStruct = std::make_unique<pi_device_binaries_struct>();
+  PIStruct->Version = PI_DEVICE_BINARIES_VERSION;
+  PIStruct->NumDeviceBinaries = PIBinaries.size();
+  PIStruct->DeviceBinaries = PIBinaries.data();
+  // According to the documentation in pi.h, the HostEntries are not used and
+  // can therefore be null.
+  PIStruct->HostEntriesBegin = nullptr;
+  PIStruct->HostEntriesEnd = nullptr;
+  Fused = false;
+  return PIStruct.get();
+}
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
new file mode 100644
index 0000000000000..400c6014761bb
--- /dev/null
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -0,0 +1,154 @@
+//==--- jit_device_binaries.hpp - Construction of PI device binaries ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/detail/pi.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+/// Representation of _pi_offload_entry for creation of JIT device binaries at
+/// runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
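+///
+/// For illustration (hypothetical values), an entry for a kernel "foo":
+///   OffloadEntryContainer Entry{"foo", nullptr, 0, 0, 0};
+///   _pi_offload_entry_struct PIEntry = Entry.getPIOffloadEntry();
+/// The PI struct borrows the container's storage and is only valid while the
+/// container is alive.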
+class OffloadEntryContainer {
+public:
+  OffloadEntryContainer(const std::string &Name, void *Addr, size_t Size,
+                        int32_t Flags, int32_t Reserved);
+
+  OffloadEntryContainer(OffloadEntryContainer &&) = default;
+  OffloadEntryContainer &operator=(OffloadEntryContainer &&) = default;
+  ~OffloadEntryContainer() = default;
+  // Copying of the container is not allowed.
+  OffloadEntryContainer(const OffloadEntryContainer &) = delete;
+  OffloadEntryContainer &operator=(const OffloadEntryContainer &) = delete;
+
+  _pi_offload_entry_struct getPIOffloadEntry();
+
+private:
+  std::unique_ptr<char[]> KernelName;
+
+  void *Address;
+  size_t EntrySize;
+  int32_t EntryFlags;
+  int32_t EntryReserved;
+};
+
+/// Representation of _pi_device_binary_property_struct for creation of JIT
+/// device binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class PropertyContainer {
+
+public:
+  PropertyContainer(const std::string &Name, void *Data, size_t Size,
+                    uint32_t Type);
+
+  PropertyContainer(PropertyContainer &&) = default;
+  PropertyContainer &operator=(PropertyContainer &&) = default;
+  ~PropertyContainer() = default;
+  // Copying of the container is not allowed.
+  PropertyContainer(const PropertyContainer &) = delete;
+  PropertyContainer &operator=(const PropertyContainer &) = delete;
+
+  _pi_device_binary_property_struct getPIProperty();
+
+private:
+  std::unique_ptr<char[]> PropName;
+  std::unique_ptr<unsigned char[]> Value;
+  size_t ValueSize;
+  uint32_t PropType;
+};
+
+/// Representation of _pi_device_binary_property_set_struct for creation of JIT
+/// device binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class PropertySetContainer {
+public:
+  PropertySetContainer(const std::string &Name);
+
+  PropertySetContainer(PropertySetContainer &&) = default;
+  PropertySetContainer &operator=(PropertySetContainer &&) = default;
+  ~PropertySetContainer() = default;
+  // Copying of the container is not allowed, as it would invalidate PI structs.
+  PropertySetContainer(const PropertySetContainer &) = delete;
+  PropertySetContainer &operator=(const PropertySetContainer &) = delete;
+
+  void addProperty(PropertyContainer &&Prop);
+
+  _pi_device_binary_property_set_struct getPIPropertySet();
+
+private:
+  std::unique_ptr<char[]> SetName;
+  bool Fused = true;
+  std::vector<PropertyContainer> Properties;
+  std::vector<_pi_device_binary_property_struct> PIProperties;
+};
+
+/// Representation of pi_device_binary_struct for creation of JIT device
+/// binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class DeviceBinaryContainer {
+public:
+  DeviceBinaryContainer() = default;
+  DeviceBinaryContainer(DeviceBinaryContainer &&) = default;
+  DeviceBinaryContainer &operator=(DeviceBinaryContainer &&) = default;
+  ~DeviceBinaryContainer() = default;
+  // Copying of the container is not allowed, as it would invalidate PI structs.
+  DeviceBinaryContainer(const DeviceBinaryContainer &) = delete;
+  DeviceBinaryContainer &operator=(const DeviceBinaryContainer &) = delete;
+
+  void addOffloadEntry(OffloadEntryContainer &&Cont);
+
+  void addProperty(PropertySetContainer &&Cont);
+
+  pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
+                                            size_t BinarySize,
+                                            size_t AddressBits);
+
+private:
+  bool Fused = true;
+  std::vector<OffloadEntryContainer> OffloadEntries;
+  std::vector<_pi_offload_entry_struct> PIOffloadEntries;
+  std::vector<PropertySetContainer> PropertySets;
+  std::vector<_pi_device_binary_property_set_struct> PIPropertySets;
+};
+
+/// Representation of pi_device_binaries_struct for creation of JIT device
+/// binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class DeviceBinariesCollection {
+
+public:
+  DeviceBinariesCollection() = default;
+  DeviceBinariesCollection(DeviceBinariesCollection &&) = default;
+  DeviceBinariesCollection &operator=(DeviceBinariesCollection &&) = default;
+  ~DeviceBinariesCollection() = default;
+  // Copying of the container is not allowed.
+  DeviceBinariesCollection(const DeviceBinariesCollection &) = delete;
+  DeviceBinariesCollection &
+  operator=(const DeviceBinariesCollection &) = delete;
+
+  void addDeviceBinary(DeviceBinaryContainer &&Cont,
+                       const unsigned char *BinaryStart, size_t BinarySize,
+                       size_t AddressBits);
+  pi_device_binaries getPIDeviceStruct();
+
+private:
+  bool Fused = true;
+  std::unique_ptr<pi_device_binaries_struct> PIStruct;
+
+  std::vector<DeviceBinaryContainer> Binaries;
+  std::vector<pi_device_binary_struct> PIBinaries;
+};
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index d3ee02d6e1f17..6e6ff4288eff2 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -9,6 +9,10 @@
 #include "detail/config.hpp"
 #include
 #include
+#include <sycl/feature_test.hpp>
+#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
+#include <detail/jit_compiler.hpp>
+#endif
 #include
 #include
 #include
@@ -1432,7 +1436,8 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
 
-  // TODO: The logic to invoke the JIT compiler to create a fused kernel from
-  // the list will be added in a later PR.
-  auto FusedCG = nullptr;
+  auto FusedCG = detail::jit_compiler::get_instance().fuseKernels(
+      Queue, CmdList, PropList);
 
   if (!FusedCG) {
     // If the JIT compiler returns a nullptr, JIT compilation of the fused
diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt
index e90dbf958d1bf..e5aa70cd4c9f3 100644
--- a/sycl/test/CMakeLists.txt
+++ b/sycl/test/CMakeLists.txt
@@ -112,3 +112,7 @@ if(SYCL_BUILD_PI_HIP)
   add_dependencies(check-sycl check-sycl-hip)
 endif()
+
+if(SYCL_ENABLE_KERNEL_FUSION)
+  add_dependencies(check-sycl check-sycl-fusion)
+endif(SYCL_ENABLE_KERNEL_FUSION)