
Enable shared memory by default #534

Merged · 9 commits · Jun 28, 2023
14 changes: 12 additions & 2 deletions omniscidb/L0Mgr/L0Mgr.cpp
@@ -19,6 +19,7 @@
 #include "Logger/Logger.h"
 #include "Utils.h"
 
+#include <algorithm>
 #include <iostream>
 #include <limits>
 
@@ -183,6 +184,9 @@ uint32_t L0Device::maxGroupCount() const {
 uint32_t L0Device::maxGroupSize() const {
   return compute_props_.maxGroupSizeX;
 }
+unsigned L0Device::maxSharedLocalMemory() const {
+  return compute_props_.maxSharedLocalMemory;
+}
 
 L0CommandQueue::L0CommandQueue(ze_command_queue_handle_t handle) : handle_(handle) {}
 
@@ -440,11 +444,17 @@ uint32_t L0Manager::getMinEUNumForAllDevices() const {
 }
 
 bool L0Manager::hasSharedMemoryAtomicsSupport() const {
-  return false;
+  return true;
 }
 
 size_t L0Manager::getMinSharedMemoryPerBlockForAllDevices() const {
-  return 0;
+  auto comp = [](const auto& a, const auto& b) {
+    return a->maxSharedLocalMemory() < b->maxSharedLocalMemory();
+  };
+  return std::min_element(
+             drivers_[0]->devices().begin(), drivers_[0]->devices().end(), comp)
+      ->get()
+      ->maxSharedLocalMemory();
 };
 
 }  // namespace l0
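
Note on the change above: `getMinSharedMemoryPerBlockForAllDevices` now reports the smallest shared-local-memory (SLM) capacity across the first driver's devices, so a shared-memory configuration sized against it fits on every device. A minimal standalone sketch of the same `std::min_element`-with-comparator pattern, using a hypothetical `Device` stand-in rather than the real `L0Device`:

```cpp
#include <algorithm>
#include <iostream>
#include <memory>
#include <vector>

// Hypothetical stand-in for L0Device; only the queried property is modeled.
struct Device {
  unsigned slm_bytes;
  unsigned maxSharedLocalMemory() const { return slm_bytes; }
};

int main() {
  std::vector<std::unique_ptr<Device>> devices;
  devices.push_back(std::make_unique<Device>(Device{65536}));
  devices.push_back(std::make_unique<Device>(Device{131072}));

  // Same shape as the patch: compare through the smart pointers, then
  // dereference the winning iterator twice (iterator -> unique_ptr -> Device).
  auto comp = [](const auto& a, const auto& b) {
    return a->maxSharedLocalMemory() < b->maxSharedLocalMemory();
  };
  auto it = std::min_element(devices.begin(), devices.end(), comp);
  std::cout << it->get()->maxSharedLocalMemory() << "\n";  // prints 65536
}
```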
1 change: 1 addition & 0 deletions omniscidb/L0Mgr/L0Mgr.h
@@ -79,6 +79,7 @@ class L0Device {
   L0Device(const L0Driver& driver, ze_device_handle_t device);
   uint32_t maxGroupCount() const;
   uint32_t maxGroupSize() const;
+  uint32_t maxSharedLocalMemory() const;
   ze_device_handle_t device() const;
   ze_context_handle_t ctx() const;
   ~L0Device();
14 changes: 10 additions & 4 deletions omniscidb/QueryEngine/CMakeLists.txt
@@ -175,6 +175,7 @@ add_dependencies(QueryEngine QueryEngineFunctionsTargets)
 
 set(cpu_runtime_function_sources RuntimeFunctions.cpp DateAdd.cpp DateTruncate.cpp)
 set(intel_gpu_runtime_function_sources l0_mapd_rt.cpp DateAdd.cpp DateTruncate.cpp)
+set(intel_gpu_helpers_sources genx.cpp)
 
 
 set(hdk_default_runtime_functions_module_dependencies
@@ -250,15 +251,20 @@ link_runtime_module(${intel_gpu_module_name} "${intel_gpu_precompiled_module_list}")
 link_runtime_module(${cpu_module_name} "${cpu_precompiled_module_list}")
 
 # SPIRV helper functions & intrinsics
-set(spirv_helper_functions_module genx.bc)
+set(spirv_helper_functions_module genx_impl.bc)
 add_custom_command(
   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Compiler/genx.ll
-  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/genx.bc
+  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${spirv_helper_functions_module}
   COMMAND ${llvm_as_cmd} ARGS ${CMAKE_CURRENT_SOURCE_DIR}/Compiler/genx.ll -o ${CMAKE_CURRENT_BINARY_DIR}/${spirv_helper_functions_module}
 )
 
+set(spirv_runtime_module genx.bc)
+precompile_modules("genx_mod_lst" ${intel_gpu_module_internal_suffix} ${precompile_intel_gpu_module_cmd} Compiler/genx.cpp)
+list(APPEND genx_mod_lst ${spirv_helper_functions_module})
+link_runtime_module(${spirv_runtime_module} "${genx_mod_lst}")
+
 if(ENABLE_L0)
-  add_custom_target(IntelGPURuntimeModule DEPENDS ${intel_gpu_module_name} ${spirv_helper_functions_module})
+  add_custom_target(IntelGPURuntimeModule DEPENDS ${intel_gpu_module_name} ${spirv_runtime_module})
   add_dependencies(QueryEngine IntelGPURuntimeModule)
 endif()
 
@@ -290,7 +296,7 @@ set(query_engine_install_artefacts
 if(ENABLE_L0)
   list(APPEND query_engine_install_artefacts
        ${CMAKE_CURRENT_BINARY_DIR}/${intel_gpu_module_name}
-       ${CMAKE_CURRENT_BINARY_DIR}/${spirv_helper_functions_module})
+       ${CMAKE_CURRENT_BINARY_DIR}/${spirv_runtime_module})
 endif()
 install(FILES ${query_engine_install_artefacts} DESTINATION QueryEngine COMPONENT "QE")
 
33 changes: 27 additions & 6 deletions omniscidb/QueryEngine/Compiler/Backend.cpp
@@ -918,11 +918,36 @@ void replace_function(llvm::Module* from, llvm::Module* to, const std::string& f
         auto new_call = llvm::CallInst::Create(local_callee, args, call->getName());
 
         llvm::ReplaceInstWithInst(call, new_call);
+        inst = new_call;
       }
+      for (unsigned op_idx = 0; op_idx < inst->getNumOperands(); ++op_idx) {
+        auto op = inst->getOperand(op_idx);
+        if (auto* global = llvm::dyn_cast<llvm::GlobalVariable>(op)) {
+          auto local_global = to->getGlobalVariable(global->getName(), true);
+          CHECK(local_global);
+          inst->setOperand(op_idx, local_global);
+        }
+      }
     }
   }
 }
 
+void insert_globals(llvm::Module* from, llvm::Module* to) {
+  for (const llvm::GlobalVariable& I : from->globals()) {
+    llvm::GlobalVariable* new_gv =
+        new llvm::GlobalVariable(*to,
+                                 I.getValueType(),
+                                 I.isConstant(),
+                                 I.getLinkage(),
+                                 (llvm::Constant*)nullptr,
+                                 I.getName(),
+                                 (llvm::GlobalVariable*)nullptr,
+                                 I.getThreadLocalMode(),
+                                 I.getType()->getAddressSpace());
+    new_gv->copyAttributesFrom(&I);
+  }
+}
+
 std::shared_ptr<L0CompilationContext> L0Backend::generateNativeGPUCode(
     const std::map<ExtModuleKinds, std::unique_ptr<llvm::Module>>& exts,
     llvm::Function* func,
@@ -940,6 +965,7 @@ std::shared_ptr<L0CompilationContext> L0Backend::generateNativeGPUCode(
 
   CHECK(exts.find(ExtModuleKinds::spirv_helper_funcs_module) != exts.end());
 
+  insert_globals(exts.at(ExtModuleKinds::spirv_helper_funcs_module).get(), module);
   for (auto& F : *(exts.at(ExtModuleKinds::spirv_helper_funcs_module))) {
     insert_declaration(exts.at(ExtModuleKinds::spirv_helper_funcs_module).get(),
                        module,
@@ -1063,7 +1089,6 @@ std::shared_ptr<Backend> getBackend(
       if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::CUDA)
        return std::make_shared<CUDABackend>(exts, is_gpu_smem_used_, gpu_target);
      if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::L0) {
-        CHECK(!is_gpu_smem_used_);
        return std::make_shared<L0Backend>(exts, gpu_target);
      }
    default:
@@ -1080,11 +1105,7 @@ void setSharedMemory(ExecutorDeviceType dt,
     case ExecutorDeviceType::CPU:
       return;
     case ExecutorDeviceType::GPU:
-      if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::CUDA)
-        backend->setSharedMemory(is_gpu_smem_used_);
-      if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::L0) {
-        CHECK(!is_gpu_smem_used_);
-      }
+      backend->setSharedMemory(is_gpu_smem_used_);
       return;
     default:
       CHECK(false);
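
The new `insert_globals` helper mirrors each global from the helper module into the kernel module with a null initializer, i.e. as a declaration only; the operand-rewrite loop added to `replace_function` then points instructions at these local declarations, and the real definitions are resolved when the modules are linked. A reduced sketch of that declaration-cloning step (assumes LLVM dev headers; `clone_global_decl` is an illustrative name, not HDK API):

```cpp
#include <llvm/IR/GlobalVariable.h>
#include <llvm/IR/Module.h>

// Copy a global's "shape" (value type, constness, linkage, attributes) into
// another module, leaving the initializer null so the copy stays a
// declaration that a later module link can resolve to the definition.
llvm::GlobalVariable* clone_global_decl(const llvm::GlobalVariable& src,
                                        llvm::Module& dst) {
  auto* gv = new llvm::GlobalVariable(dst,
                                      src.getValueType(),
                                      src.isConstant(),
                                      src.getLinkage(),
                                      /*Initializer=*/nullptr,  // declaration only
                                      src.getName(),
                                      /*InsertBefore=*/nullptr,
                                      src.getThreadLocalMode(),
                                      src.getType()->getAddressSpace());
  gv->copyAttributesFrom(&src);
  return gv;
}
```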
20 changes: 20 additions & 0 deletions omniscidb/QueryEngine/Compiler/CommonRuntimeDefs.h
@@ -0,0 +1,20 @@
+/**
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include "Shared/funcannotations.h"
+
+template <class T>
+struct remove_addr_space {
+  typedef T type;
+};
+
+#ifdef L0_RUNTIME_ENABLED
+template <class T>
+struct remove_addr_space<GENERIC_ADDR_SPACE T> {
+  typedef T type;
+};
+#endif
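
`remove_addr_space` is an identity trait on CPU builds; under `L0_RUNTIME_ENABLED` the partial specialization additionally maps a `GENERIC_ADDR_SPACE`-qualified type back to its plain form, so shared runtime code can name one unqualified type. A hedged usage sketch (standalone, so `GENERIC_ADDR_SPACE` is stubbed empty here instead of coming from `Shared/funcannotations.h`):

```cpp
#include <cstdint>
#include <type_traits>

// Stub for a standalone build; the real macro comes from
// Shared/funcannotations.h and expands to an address-space qualifier on
// SPIR-V builds, or to nothing on CPU builds.
#define GENERIC_ADDR_SPACE

template <class T>
struct remove_addr_space {
  typedef T type;
};

// With the empty stub the primary template applies and the trait is an
// identity, exactly as on a CPU build.
static_assert(std::is_same<remove_addr_space<double>::type, double>::value, "");
static_assert(
    std::is_same<remove_addr_space<GENERIC_ADDR_SPACE int64_t*>::type,
                 int64_t*>::value,
    "");

int main() {}
```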
1 change: 1 addition & 0 deletions omniscidb/QueryEngine/Compiler/HelperFunctions.cpp
@@ -70,6 +70,7 @@ void verify_function_ir(const llvm::Function* func) {
     err_os << "\n-----\n";
     func->print(err_os, nullptr);
     err_os << "\n-----\n";
+    DUMP_MODULE(func->getParent(), "invalid.ll");
     LOG(FATAL) << err_ss.str();
   }
 }
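
`DUMP_MODULE` (an existing HDK debugging helper) now writes the module containing the invalid function to `invalid.ll` before the fatal log, so the broken IR survives the crash. A sketch of what such a dump boils down to in plain LLVM API (assuming a recent LLVM; this is not the macro's actual definition):

```cpp
#include <llvm/IR/Module.h>
#include <llvm/Support/raw_ostream.h>

#include <string>
#include <system_error>

// Print a module's textual IR to a file for offline inspection.
void dump_module_to_file(const llvm::Module* module, const std::string& filename) {
  std::error_code ec;
  llvm::raw_fd_ostream os(filename, ec);
  if (!ec) {
    module->print(os, /*AssemblyAnnotationWriter=*/nullptr);
  }
}
```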
107 changes: 107 additions & 0 deletions omniscidb/QueryEngine/Compiler/genx.cpp
@@ -0,0 +1,107 @@
+/**
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <algorithm>
+#include <cstdint>
+
+#include "Shared/funcannotations.h"
+
+extern "C" {
+int64_t atomic_cas_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t, int64_t);
+int32_t atomic_cas_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t, int32_t);
+int64_t atomic_xchg_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t);
+int32_t atomic_xchg_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t);
+double atomic_min_double(GENERIC_ADDR_SPACE double* addr, const double val);
+double atomic_min_float(GENERIC_ADDR_SPACE float* addr, const float val);
+double atomic_max_double(GENERIC_ADDR_SPACE double* addr, const double val);
+double atomic_max_float(GENERIC_ADDR_SPACE float* addr, const float val);
+
+void agg_max_shared(GENERIC_ADDR_SPACE int64_t* agg, const int64_t val);
+int64_t agg_count_shared(GENERIC_ADDR_SPACE int64_t* agg, const int64_t val);
+uint32_t agg_count_int32_shared(GENERIC_ADDR_SPACE uint32_t* agg, const int32_t val);
+
+const GENERIC_ADDR_SPACE int64_t* init_shared_mem_nop(
+    const GENERIC_ADDR_SPACE int64_t* groups_buffer,
+    const int32_t groups_buffer_size) {
+  return groups_buffer;
+}
+
+// TODO: these are almost the same in cuda, move to a single source
+#define DEF_AGG_ID_INT_SHARED(n)                                             \
+  extern "C" void agg_id_int##n##_shared(GENERIC_ADDR_SPACE int##n##_t* agg, \
+                                         const int##n##_t val) {             \
+    *agg = val;                                                              \
+  }
+
+DEF_AGG_ID_INT_SHARED(32)
+DEF_AGG_ID_INT_SHARED(16)
+DEF_AGG_ID_INT_SHARED(8)
+
+#undef DEF_AGG_ID_INT_SHARED
+
+void agg_id_float_shared(GENERIC_ADDR_SPACE int32_t* agg, const float val) {
+  *reinterpret_cast<GENERIC_ADDR_SPACE float*>(agg) = val;
+}
+
+void agg_id_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  *reinterpret_cast<GENERIC_ADDR_SPACE double*>(agg) = val;
+}
+
+uint32_t agg_count_float_shared(GENERIC_ADDR_SPACE uint32_t* agg, const float val) {
+  return agg_count_int32_shared(agg, val);
+}
+
+int64_t agg_count_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  return agg_count_shared(agg, static_cast<int64_t>(val));
+}
+
+void agg_min_float_shared(GENERIC_ADDR_SPACE int32_t* agg, const float val) {
+  atomic_min_float(reinterpret_cast<GENERIC_ADDR_SPACE float*>(agg), val);
+}
+
+void agg_min_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  atomic_min_double(reinterpret_cast<GENERIC_ADDR_SPACE double*>(agg), val);
+}
+
+void agg_min_float_skip_val_shared(GENERIC_ADDR_SPACE int32_t* agg,
+                                   const float val,
+                                   const float skip_val) {
+  if (val != skip_val) {
+    agg_min_float_shared(agg, val);
+  }
+}
+
+void agg_min_double_skip_val_shared(GENERIC_ADDR_SPACE int64_t* agg,
+                                    const double val,
+                                    const double skip_val) {
+  if (val != skip_val) {
+    agg_min_double_shared(agg, val);
+  }
+}
+
+void agg_max_float_shared(GENERIC_ADDR_SPACE int32_t* agg, const float val) {
+  atomic_max_float(reinterpret_cast<GENERIC_ADDR_SPACE float*>(agg), val);
+}
+void agg_max_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  atomic_max_double(reinterpret_cast<GENERIC_ADDR_SPACE double*>(agg), val);
+}
+
+void agg_max_float_skip_val_shared(GENERIC_ADDR_SPACE int32_t* agg,
+                                   const float val,
+                                   const float skip_val) {
+  if (val != skip_val) {
+    agg_max_float_shared(agg, val);
+  }
+}
+
+void agg_max_double_skip_val_shared(GENERIC_ADDR_SPACE int64_t* agg,
+                                    const double val,
+                                    const double skip_val) {
+  if (val != skip_val) {
+    agg_max_double_shared(agg, val);
+  }
+}
+}
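
One detail worth calling out in genx.cpp: the float and double aggregates write through `int32_t*`/`int64_t*` slots and reinterpret the pointer, because aggregation buffers are integer arrays by convention (the CUDA runtime uses the same layout). A standalone sketch of that slot-punning pattern:

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>

// Aggregation buffers are int64_t arrays; a double result is stored by
// reinterpreting the slot, mirroring agg_id_double_shared in the patch.
void agg_id_double(int64_t* agg, const double val) {
  *reinterpret_cast<double*>(agg) = val;
}

int main() {
  int64_t slot = 0;
  agg_id_double(&slot, 2.5);

  double out;
  std::memcpy(&out, &slot, sizeof out);  // read the bits back as a double
  std::cout << out << "\n";              // prints 2.5
}
```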