
Enable shared memory by default #534

Merged · 9 commits · Jun 28, 2023
14 changes: 12 additions & 2 deletions omniscidb/L0Mgr/L0Mgr.cpp
@@ -19,6 +19,7 @@
 #include "Logger/Logger.h"
 #include "Utils.h"
 
+#include <algorithm>
 #include <iostream>
 #include <limits>
 
@@ -183,6 +184,9 @@ uint32_t L0Device::maxGroupCount() const {
 uint32_t L0Device::maxGroupSize() const {
   return compute_props_.maxGroupSizeX;
 }
+unsigned L0Device::maxSharedLocalMemory() const {
+  return compute_props_.maxSharedLocalMemory;
+}
 
 L0CommandQueue::L0CommandQueue(ze_command_queue_handle_t handle) : handle_(handle) {}
 
@@ -440,11 +444,17 @@ uint32_t L0Manager::getMinEUNumForAllDevices() const {
 }
 
 bool L0Manager::hasSharedMemoryAtomicsSupport() const {
-  return false;
+  return true;
 }
 
 size_t L0Manager::getMinSharedMemoryPerBlockForAllDevices() const {
-  return 0;
+  auto comp = [](const auto& a, const auto& b) {
+    return a->maxSharedLocalMemory() < b->maxSharedLocalMemory();
+  };
+  return std::min_element(
+             drivers_[0]->devices().begin(), drivers_[0]->devices().end(), comp)
+      ->get()
+      ->maxSharedLocalMemory();
 };
 
 }  // namespace l0
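
Note on the change above: `getMinSharedMemoryPerBlockForAllDevices` now reports the smallest shared-local-memory (SLM) capacity across the first driver's devices, so a shared-memory configuration sized against it fits on every device. A minimal standalone sketch of the same `std::min_element`-with-comparator pattern, using a hypothetical `Device` stand-in rather than the real `L0Device`:

```cpp
#include <algorithm>
#include <iostream>
#include <memory>
#include <vector>

// Hypothetical stand-in for L0Device; only the queried property is modeled.
struct Device {
  unsigned slm_bytes;
  unsigned maxSharedLocalMemory() const { return slm_bytes; }
};

int main() {
  std::vector<std::unique_ptr<Device>> devices;
  devices.push_back(std::make_unique<Device>(Device{65536}));
  devices.push_back(std::make_unique<Device>(Device{131072}));

  // Same shape as the patch: compare through the smart pointers, then
  // dereference the winning iterator twice (iterator -> unique_ptr -> Device).
  auto comp = [](const auto& a, const auto& b) {
    return a->maxSharedLocalMemory() < b->maxSharedLocalMemory();
  };
  auto it = std::min_element(devices.begin(), devices.end(), comp);
  std::cout << it->get()->maxSharedLocalMemory() << "\n";  // prints 65536
}
```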
1 change: 1 addition & 0 deletions omniscidb/L0Mgr/L0Mgr.h
@@ -79,6 +79,7 @@ class L0Device {
   L0Device(const L0Driver& driver, ze_device_handle_t device);
   uint32_t maxGroupCount() const;
   uint32_t maxGroupSize() const;
+  uint32_t maxSharedLocalMemory() const;
   ze_device_handle_t device() const;
   ze_context_handle_t ctx() const;
   ~L0Device();
14 changes: 10 additions & 4 deletions omniscidb/QueryEngine/CMakeLists.txt
@@ -175,6 +175,7 @@ add_dependencies(QueryEngine QueryEngineFunctionsTargets)
 
 set(cpu_runtime_function_sources RuntimeFunctions.cpp DateAdd.cpp DateTruncate.cpp)
 set(intel_gpu_runtime_function_sources l0_mapd_rt.cpp DateAdd.cpp DateTruncate.cpp)
+set(intel_gpu_helpers_sources genx.cpp)
 
 
 set(hdk_default_runtime_functions_module_dependencies
@@ -250,15 +251,20 @@ link_runtime_module(${intel_gpu_module_name} "${intel_gpu_precompiled_module_list}")
 link_runtime_module(${cpu_module_name} "${cpu_precompiled_module_list}")
 
 # SPIRV helper functions & intrinsics
-set(spirv_helper_functions_module genx.bc)
+set(spirv_helper_functions_module genx_impl.bc)
 add_custom_command(
   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Compiler/genx.ll
-  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/genx.bc
+  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${spirv_helper_functions_module}
   COMMAND ${llvm_as_cmd} ARGS ${CMAKE_CURRENT_SOURCE_DIR}/Compiler/genx.ll -o ${CMAKE_CURRENT_BINARY_DIR}/${spirv_helper_functions_module}
 )
 
+set(spirv_runtime_module genx.bc)
+precompile_modules("genx_mod_lst" ${intel_gpu_module_internal_suffix} ${precompile_intel_gpu_module_cmd} Compiler/genx.cpp)
+list(APPEND genx_mod_lst ${spirv_helper_functions_module})
+link_runtime_module(${spirv_runtime_module} "${genx_mod_lst}")
+
 if(ENABLE_L0)
-  add_custom_target(IntelGPURuntimeModule DEPENDS ${intel_gpu_module_name} ${spirv_helper_functions_module})
+  add_custom_target(IntelGPURuntimeModule DEPENDS ${intel_gpu_module_name} ${spirv_runtime_module})
   add_dependencies(QueryEngine IntelGPURuntimeModule)
 endif()
 
@@ -290,7 +296,7 @@ set(query_engine_install_artefacts
 if(ENABLE_L0)
   list(APPEND query_engine_install_artefacts
        ${CMAKE_CURRENT_BINARY_DIR}/${intel_gpu_module_name}
-       ${CMAKE_CURRENT_BINARY_DIR}/${spirv_helper_functions_module})
+       ${CMAKE_CURRENT_BINARY_DIR}/${spirv_runtime_module})
 endif()
 install(FILES ${query_engine_install_artefacts} DESTINATION QueryEngine COMPONENT "QE")
 
33 changes: 27 additions & 6 deletions omniscidb/QueryEngine/Compiler/Backend.cpp
@@ -918,11 +918,36 @@ void replace_function(llvm::Module* from, llvm::Module* to, const std::string& f
         auto new_call = llvm::CallInst::Create(local_callee, args, call->getName());
 
         llvm::ReplaceInstWithInst(call, new_call);
+        inst = new_call;
       }
+      for (unsigned op_idx = 0; op_idx < inst->getNumOperands(); ++op_idx) {
+        auto op = inst->getOperand(op_idx);
+        if (auto* global = llvm::dyn_cast<llvm::GlobalVariable>(op)) {
+          auto local_global = to->getGlobalVariable(global->getName(), true);
+          CHECK(local_global);
+          inst->setOperand(op_idx, local_global);
+        }
+      }
     }
   }
 }
 
+void insert_globals(llvm::Module* from, llvm::Module* to) {
+  for (const llvm::GlobalVariable& I : from->globals()) {
+    llvm::GlobalVariable* new_gv =
+        new llvm::GlobalVariable(*to,
+                                 I.getValueType(),
+                                 I.isConstant(),
+                                 I.getLinkage(),
+                                 (llvm::Constant*)nullptr,
+                                 I.getName(),
+                                 (llvm::GlobalVariable*)nullptr,
+                                 I.getThreadLocalMode(),
+                                 I.getType()->getAddressSpace());
+    new_gv->copyAttributesFrom(&I);
+  }
+}
+
 std::shared_ptr<L0CompilationContext> L0Backend::generateNativeGPUCode(
     const std::map<ExtModuleKinds, std::unique_ptr<llvm::Module>>& exts,
     llvm::Function* func,
@@ -940,6 +965,7 @@ std::shared_ptr<L0CompilationContext> L0Backend::generateNativeGPUCode(
 
   CHECK(exts.find(ExtModuleKinds::spirv_helper_funcs_module) != exts.end());
 
+  insert_globals(exts.at(ExtModuleKinds::spirv_helper_funcs_module).get(), module);
   for (auto& F : *(exts.at(ExtModuleKinds::spirv_helper_funcs_module))) {
     insert_declaration(exts.at(ExtModuleKinds::spirv_helper_funcs_module).get(),
                        module,
@@ -1063,7 +1089,6 @@ std::shared_ptr<Backend> getBackend(
       if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::CUDA)
        return std::make_shared<CUDABackend>(exts, is_gpu_smem_used_, gpu_target);
      if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::L0) {
-        CHECK(!is_gpu_smem_used_);
        return std::make_shared<L0Backend>(exts, gpu_target);
      }
    default:
@@ -1080,11 +1105,7 @@ void setSharedMemory(ExecutorDeviceType dt,
     case ExecutorDeviceType::CPU:
       return;
     case ExecutorDeviceType::GPU:
-      if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::CUDA)
-        backend->setSharedMemory(is_gpu_smem_used_);
-      if (gpu_target.gpu_mgr->getPlatform() == GpuMgrPlatform::L0) {
-        CHECK(!is_gpu_smem_used_);
-      }
+      backend->setSharedMemory(is_gpu_smem_used_);
       return;
     default:
       CHECK(false);
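
The new `insert_globals` helper mirrors each global from the helper module into the kernel module with a null initializer, i.e. as a declaration only; the operand-rewrite loop added to `replace_function` then points instructions at these local declarations, and the real definitions are resolved when the modules are linked. A reduced sketch of that declaration-cloning step (assumes LLVM dev headers; `clone_global_decl` is an illustrative name, not HDK API):

```cpp
#include <llvm/IR/GlobalVariable.h>
#include <llvm/IR/Module.h>

// Copy a global's "shape" (value type, constness, linkage, attributes) into
// another module, leaving the initializer null so the copy stays a
// declaration that a later module link can resolve to the definition.
llvm::GlobalVariable* clone_global_decl(const llvm::GlobalVariable& src,
                                        llvm::Module& dst) {
  auto* gv = new llvm::GlobalVariable(dst,
                                      src.getValueType(),
                                      src.isConstant(),
                                      src.getLinkage(),
                                      /*Initializer=*/nullptr,  // declaration only
                                      src.getName(),
                                      /*InsertBefore=*/nullptr,
                                      src.getThreadLocalMode(),
                                      src.getType()->getAddressSpace());
  gv->copyAttributesFrom(&src);
  return gv;
}
```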
20 changes: 20 additions & 0 deletions omniscidb/QueryEngine/Compiler/CommonRuntimeDefs.h
@@ -0,0 +1,20 @@
+/**
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include "Shared/funcannotations.h"
+
+template <class T>
+struct remove_addr_space {
+  typedef T type;
+};
+
+#ifdef L0_RUNTIME_ENABLED
+template <class T>
+struct remove_addr_space<GENERIC_ADDR_SPACE T> {
+  typedef T type;
+};
+#endif
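
`remove_addr_space` is an identity trait on CPU builds; under `L0_RUNTIME_ENABLED` the partial specialization additionally maps a `GENERIC_ADDR_SPACE`-qualified type back to its plain form, so shared runtime code can name one unqualified type. A hedged usage sketch (standalone, so `GENERIC_ADDR_SPACE` is stubbed empty here instead of coming from `Shared/funcannotations.h`):

```cpp
#include <cstdint>
#include <type_traits>

// Stub for a standalone build; the real macro comes from
// Shared/funcannotations.h and expands to an address-space qualifier on
// SPIR-V builds, or to nothing on CPU builds.
#define GENERIC_ADDR_SPACE

template <class T>
struct remove_addr_space {
  typedef T type;
};

// With the empty stub the primary template applies and the trait is an
// identity, exactly as on a CPU build.
static_assert(std::is_same<remove_addr_space<double>::type, double>::value, "");
static_assert(
    std::is_same<remove_addr_space<GENERIC_ADDR_SPACE int64_t*>::type,
                 int64_t*>::value,
    "");

int main() {}
```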
1 change: 1 addition & 0 deletions omniscidb/QueryEngine/Compiler/HelperFunctions.cpp
@@ -70,6 +70,7 @@ void verify_function_ir(const llvm::Function* func) {
     err_os << "\n-----\n";
     func->print(err_os, nullptr);
     err_os << "\n-----\n";
+    DUMP_MODULE(func->getParent(), "invalid.ll");
     LOG(FATAL) << err_ss.str();
   }
 }
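
`DUMP_MODULE` (an existing HDK debugging helper) now writes the module containing the invalid function to `invalid.ll` before the fatal log, so the broken IR survives the crash. A sketch of what such a dump boils down to in plain LLVM API (assuming a recent LLVM; this is not the macro's actual definition):

```cpp
#include <llvm/IR/Module.h>
#include <llvm/Support/raw_ostream.h>

#include <string>
#include <system_error>

// Print a module's textual IR to a file for offline inspection.
void dump_module_to_file(const llvm::Module* module, const std::string& filename) {
  std::error_code ec;
  llvm::raw_fd_ostream os(filename, ec);
  if (!ec) {
    module->print(os, /*AssemblyAnnotationWriter=*/nullptr);
  }
}
```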
107 changes: 107 additions & 0 deletions omniscidb/QueryEngine/Compiler/genx.cpp
@@ -0,0 +1,107 @@
+/**
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <algorithm>
+#include <cstdint>
+
+#include "Shared/funcannotations.h"
+
+extern "C" {
+int64_t atomic_cas_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t, int64_t);
+int32_t atomic_cas_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t, int32_t);
+int64_t atomic_xchg_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t);
+int32_t atomic_xchg_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t);
+double atomic_min_double(GENERIC_ADDR_SPACE double* addr, const double val);
+double atomic_min_float(GENERIC_ADDR_SPACE float* addr, const float val);
+double atomic_max_double(GENERIC_ADDR_SPACE double* addr, const double val);
+double atomic_max_float(GENERIC_ADDR_SPACE float* addr, const float val);
+
+void agg_max_shared(GENERIC_ADDR_SPACE int64_t* agg, const int64_t val);
+int64_t agg_count_shared(GENERIC_ADDR_SPACE int64_t* agg, const int64_t val);
+uint32_t agg_count_int32_shared(GENERIC_ADDR_SPACE uint32_t* agg, const int32_t val);
+
+const GENERIC_ADDR_SPACE int64_t* init_shared_mem_nop(
+    const GENERIC_ADDR_SPACE int64_t* groups_buffer,
+    const int32_t groups_buffer_size) {
+  return groups_buffer;
+}
+
+// TODO: these are almost the same in cuda, move to a single source
+#define DEF_AGG_ID_INT_SHARED(n)                                             \
+  extern "C" void agg_id_int##n##_shared(GENERIC_ADDR_SPACE int##n##_t* agg, \
+                                         const int##n##_t val) {             \
+    *agg = val;                                                              \
+  }
+
+DEF_AGG_ID_INT_SHARED(32)
+DEF_AGG_ID_INT_SHARED(16)
+DEF_AGG_ID_INT_SHARED(8)
+
+#undef DEF_AGG_ID_INT_SHARED
+
+void agg_id_float_shared(GENERIC_ADDR_SPACE int32_t* agg, const float val) {
+  *reinterpret_cast<GENERIC_ADDR_SPACE float*>(agg) = val;
+}
+
+void agg_id_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  *reinterpret_cast<GENERIC_ADDR_SPACE double*>(agg) = val;
+}
+
+uint32_t agg_count_float_shared(GENERIC_ADDR_SPACE uint32_t* agg, const float val) {
+  return agg_count_int32_shared(agg, val);
+}
+
+int64_t agg_count_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  return agg_count_shared(agg, static_cast<int64_t>(val));
+}
+
+void agg_min_float_shared(GENERIC_ADDR_SPACE int32_t* agg, const float val) {
+  atomic_min_float(reinterpret_cast<GENERIC_ADDR_SPACE float*>(agg), val);
+}
+
+void agg_min_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  atomic_min_double(reinterpret_cast<GENERIC_ADDR_SPACE double*>(agg), val);
+}
+
+void agg_min_float_skip_val_shared(GENERIC_ADDR_SPACE int32_t* agg,
+                                   const float val,
+                                   const float skip_val) {
+  if (val != skip_val) {
+    agg_min_float_shared(agg, val);
+  }
+}
+
+void agg_min_double_skip_val_shared(GENERIC_ADDR_SPACE int64_t* agg,
+                                    const double val,
+                                    const double skip_val) {
+  if (val != skip_val) {
+    agg_min_double_shared(agg, val);
+  }
+}
+
+void agg_max_float_shared(GENERIC_ADDR_SPACE int32_t* agg, const float val) {
+  atomic_max_float(reinterpret_cast<GENERIC_ADDR_SPACE float*>(agg), val);
+}
+void agg_max_double_shared(GENERIC_ADDR_SPACE int64_t* agg, const double val) {
+  atomic_max_double(reinterpret_cast<GENERIC_ADDR_SPACE double*>(agg), val);
+}
+
+void agg_max_float_skip_val_shared(GENERIC_ADDR_SPACE int32_t* agg,
+                                   const float val,
+                                   const float skip_val) {
+  if (val != skip_val) {
+    agg_max_float_shared(agg, val);
+  }
+}
+
+void agg_max_double_skip_val_shared(GENERIC_ADDR_SPACE int64_t* agg,
+                                    const double val,
+                                    const double skip_val) {
+  if (val != skip_val) {
+    agg_max_double_shared(agg, val);
+  }
+}
+}
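
One detail worth calling out in genx.cpp: the float and double aggregates write through `int32_t*`/`int64_t*` slots and reinterpret the pointer, because aggregation buffers are integer arrays by convention (the CUDA runtime uses the same layout). A standalone sketch of that slot-punning pattern:

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>

// Aggregation buffers are int64_t arrays; a double result is stored by
// reinterpreting the slot, mirroring agg_id_double_shared in the patch.
void agg_id_double(int64_t* agg, const double val) {
  *reinterpret_cast<double*>(agg) = val;
}

int main() {
  int64_t slot = 0;
  agg_id_double(&slot, 2.5);

  double out;
  std::memcpy(&out, &slot, sizeof out);  // read the bits back as a double
  std::cout << out << "\n";              // prints 2.5
}
```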