Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Get rid of frag count overhead in prepareKernelParams #665

Merged
merged 1 commit into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions omniscidb/BufferProvider/BufferProvider.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,17 @@ class BufferProvider {
const int8_t* host_ptr,
const size_t num_bytes,
const int device_id) const = 0;

// Copy host memory to the device asynchronously when the underlying GPU
// manager supports it, otherwise fall back to a synchronous copy (see the
// DataMgrBufferProvider implementation, which dispatches on canLoadAsync()).
virtual void copyToDeviceAsyncIfPossible(int8_t* device_ptr,
                                         const int8_t* host_ptr,
                                         const size_t num_bytes,
                                         const int device_id) const = 0;

// Unconditionally enqueue an asynchronous host-to-device copy. Completion is
// only guaranteed after synchronizeStream(device_id) returns.
virtual void copyToDeviceAsync(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_id) const = 0;
// Block until previously enqueued async copies for the device have finished.
virtual void synchronizeStream(const int device_id) const = 0;
virtual void copyFromDevice(int8_t* host_ptr,
const int8_t* device_ptr,
const size_t num_bytes,
Expand Down
16 changes: 16 additions & 0 deletions omniscidb/CudaMgr/CudaMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
initDeviceGroup();
createDeviceContexts();
printDeviceProperties();
for (int device_id = 0; device_id < device_count_; device_id++) {
setContext(device_id);
checkError(cuStreamCreate(&stream_, CU_STREAM_NON_BLOCKING));
}
}

void CudaMgr::initDeviceGroup() {
Expand Down Expand Up @@ -108,6 +112,18 @@ void CudaMgr::copyHostToDevice(int8_t* device_ptr,
cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
}

// Enqueue an asynchronous host-to-device copy on stream_ via the CUDA driver
// API. The data is not guaranteed to be resident on the device until
// synchronizeStream(device_num) is called.
// NOTE(review): the constructor creates one stream per device in a loop but
// stores each into the single stream_ member, so only the last device's
// stream survives — confirm whether a per-device stream was intended.
// NOTE(review): per the CUDA driver API, cuMemcpyHtoDAsync overlaps with
// host execution only for page-locked host memory — verify callers' buffers.
void CudaMgr::copyHostToDeviceAsync(int8_t* device_ptr,
                                    const int8_t* host_ptr,
                                    const size_t num_bytes,
                                    const int device_num) {
  setContext(device_num);
  checkError(cuMemcpyHtoDAsync(
      reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, stream_));
}
// Block the calling host thread until all work queued on stream_ (e.g. by
// copyHostToDeviceAsync) has completed.
void CudaMgr::synchronizeStream(const int device_num) {
  setContext(device_num);
  checkError(cuStreamSynchronize(stream_));
}
void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
const int8_t* device_ptr,
const size_t num_bytes,
Expand Down
13 changes: 12 additions & 1 deletion omniscidb/CudaMgr/CudaMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ class CudaMgr : public GpuMgr {
const int8_t* host_ptr,
const size_t num_bytes,
const int device_num) override;

// Enqueue an async host-to-device copy on this manager's CUDA stream;
// completion is only guaranteed after synchronizeStream(device_num).
void copyHostToDeviceAsync(int8_t* device_ptr,
                           const int8_t* host_ptr,
                           const size_t num_bytes,
                           const int device_num) override;

// Wait for all work previously queued on the stream to finish.
void synchronizeStream(const int device_num) override;

void copyDeviceToHost(int8_t* host_ptr,
const int8_t* device_ptr,
const size_t num_bytes,
Expand All @@ -117,6 +125,8 @@ class CudaMgr : public GpuMgr {
const size_t num_bytes,
const int device_num) override;

// Always true for CUDA builds: async host-to-device loads are available via
// copyHostToDeviceAsync()/synchronizeStream().
// (Fixed: dropped the stray trailing ';' after the function body, which
// triggers -Wextra-semi style warnings.)
bool canLoadAsync() const override { return async_data_load_available; }

size_t getMinSharedMemoryPerBlockForAllDevices() const override {
return min_shared_memory_per_block_for_all_devices;
}
Expand Down Expand Up @@ -268,6 +278,7 @@ class CudaMgr : public GpuMgr {
void checkError(CUresult cu_result) const;

int gpu_driver_version_;
// Stream used by copyHostToDeviceAsync()/synchronizeStream().
// NOTE(review): the constructor loops over all devices and calls
// cuStreamCreate into this single member each iteration, so streams created
// for earlier devices are overwritten (and leaked) and only the last
// device's stream is kept — verify whether this should be per-device.
CUstream stream_;
#endif

int device_count_;
Expand All @@ -277,8 +288,8 @@ class CudaMgr : public GpuMgr {
std::vector<DeviceProperties> device_properties_;
omnisci::DeviceGroup device_group_;
std::vector<CUcontext> device_contexts_;

mutable std::mutex device_cleanup_mutex_;
static constexpr bool async_data_load_available{true};
};

} // Namespace CudaMgr_Namespace
Expand Down
12 changes: 12 additions & 0 deletions omniscidb/CudaMgr/CudaMgrNoCuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ void CudaMgr::copyHostToDevice(int8_t* device_ptr,
const int device_num) {
CHECK(false);
}

// CUDA-less build stub: async copies must never be reached without CUDA.
void CudaMgr::copyHostToDeviceAsync(int8_t* device_ptr,
                                    const int8_t* host_ptr,
                                    const size_t num_bytes,
                                    const int device_num) {
  CHECK(false);
}

// CUDA-less build stub: there is no stream to synchronize without CUDA.
void CudaMgr::synchronizeStream(const int device_num) {
  CHECK(false);
}

void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
const int8_t* device_ptr,
const size_t num_bytes,
Expand Down
6 changes: 6 additions & 0 deletions omniscidb/DataMgr/Allocators/GpuAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ Data_Namespace::AbstractBuffer* GpuAllocator::allocGpuAbstractBuffer(
return ab;
}

// Convenience overload: allocate num_bytes on this allocator's own device,
// delegating to the static three-argument overload with the stored
// buffer provider and device id.
Data_Namespace::AbstractBuffer* GpuAllocator::allocGpuAbstractBuffer(
    const size_t num_bytes) {
  CHECK(buffer_provider_);
  return allocGpuAbstractBuffer(buffer_provider_, num_bytes, device_id_);
}

int8_t* GpuAllocator::alloc(const size_t num_bytes) {
CHECK(buffer_provider_);
owned_buffers_.emplace_back(
Expand Down
2 changes: 2 additions & 0 deletions omniscidb/DataMgr/Allocators/GpuAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class GpuAllocator : public DeviceAllocator {
const size_t num_bytes,
const int device_id);

// Allocate a buffer of num_bytes on this allocator's device (delegates to
// the static overload using the stored provider and device id).
Data_Namespace::AbstractBuffer* allocGpuAbstractBuffer(const size_t num_bytes);

int8_t* alloc(const size_t num_bytes) override;

void free(Data_Namespace::AbstractBuffer* ab) const override;
Expand Down
31 changes: 31 additions & 0 deletions omniscidb/DataMgr/DataMgrBufferProvider.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,37 @@ void DataMgrBufferProvider::copyToDevice(int8_t* device_ptr,
gpu_mgr->copyHostToDevice(device_ptr, host_ptr, num_bytes, device_id);
}

// Enqueue an asynchronous host-to-device copy through the GPU manager.
// The data is only guaranteed to be on the device once
// synchronizeStream(device_id) has returned.
void DataMgrBufferProvider::copyToDeviceAsync(int8_t* device_ptr,
                                              const int8_t* host_ptr,
                                              const size_t num_bytes,
                                              const int device_id) const {
  CHECK(data_mgr_);
  auto* mgr = data_mgr_->getGpuMgr();
  CHECK(mgr);
  mgr->copyHostToDeviceAsync(device_ptr, host_ptr, num_bytes, device_id);
}

// Copy host memory to the device, asynchronously when the GPU manager
// advertises async support (canLoadAsync()), synchronously otherwise.
// When the async path is taken the caller must later invoke
// synchronizeStream(device_id) before reading the data on the device.
void DataMgrBufferProvider::copyToDeviceAsyncIfPossible(int8_t* device_ptr,
                                                        const int8_t* host_ptr,
                                                        const size_t num_bytes,
                                                        const int device_id) const {
  CHECK(data_mgr_);
  auto* mgr = data_mgr_->getGpuMgr();
  CHECK(mgr);
  if (!mgr->canLoadAsync()) {
    // Manager cannot load asynchronously: do a plain blocking copy.
    mgr->copyHostToDevice(device_ptr, host_ptr, num_bytes, device_id);
    return;
  }
  mgr->copyHostToDeviceAsync(device_ptr, host_ptr, num_bytes, device_id);
}

// Block until async copies previously enqueued for device_num have finished,
// by delegating to the GPU manager's stream synchronization.
void DataMgrBufferProvider::synchronizeStream(const int device_num) const {
  CHECK(data_mgr_);
  auto* mgr = data_mgr_->getGpuMgr();
  CHECK(mgr);
  mgr->synchronizeStream(device_num);
}

void DataMgrBufferProvider::copyFromDevice(int8_t* host_ptr,
const int8_t* device_ptr,
const size_t num_bytes,
Expand Down
11 changes: 11 additions & 0 deletions omniscidb/DataMgr/DataMgrBufferProvider.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ class DataMgrBufferProvider : public BufferProvider {
const int8_t* host_ptr,
const size_t num_bytes,
const int device_id) const override;

// Async copy when the GPU manager supports it, else synchronous fallback.
void copyToDeviceAsyncIfPossible(int8_t* device_ptr,
                                 const int8_t* host_ptr,
                                 const size_t num_bytes,
                                 const int device_id) const override;

// Unconditional async copy; pair with synchronizeStream(device_id).
void copyToDeviceAsync(int8_t* device_ptr,
                       const int8_t* host_ptr,
                       const size_t num_bytes,
                       const int device_id) const override;
// Wait for enqueued async copies on the device to complete.
void synchronizeStream(const int device_id) const override;
void copyFromDevice(int8_t* host_ptr,
const int8_t* device_ptr,
const size_t num_bytes,
Expand Down
10 changes: 10 additions & 0 deletions omniscidb/DataMgr/GpuMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ struct GpuMgr {
const int8_t* host_ptr,
const size_t num_bytes,
const int device_num) = 0;

// Enqueue an async host-to-device copy; only meaningful when canLoadAsync()
// returns true. Completion is guaranteed after synchronizeStream(device_num).
virtual void copyHostToDeviceAsync(int8_t* device_ptr,
                                   const int8_t* host_ptr,
                                   const size_t num_bytes,
                                   const int device_num) = 0;

// Wait for all async work queued for the device to finish.
virtual void synchronizeStream(const int device_num) = 0;

virtual void copyDeviceToHost(int8_t* host_ptr,
const int8_t* device_ptr,
const size_t num_bytes,
Expand Down Expand Up @@ -63,6 +71,8 @@ struct GpuMgr {
virtual uint32_t getGridSize() const = 0;
virtual uint32_t getMinEUNumForAllDevices() const = 0;
virtual bool hasSharedMemoryAtomicsSupport() const = 0;
// Whether copyHostToDeviceAsync()/synchronizeStream() are usable; callers
// such as copyToDeviceAsyncIfPossible() use this to choose sync vs async.
virtual bool canLoadAsync() const = 0;

// TODO: hasFP64Support implementations do not account for different device capabilities
virtual bool hasFP64Support() const { return true; };
virtual size_t getMinSharedMemoryPerBlockForAllDevices() const = 0;
Expand Down
14 changes: 12 additions & 2 deletions omniscidb/L0Mgr/L0Mgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,16 @@ void* allocate_device_mem(const size_t num_bytes, L0Device& device);
class L0Manager : public GpuMgr {
public:
L0Manager();

// Async loads are not implemented for L0 (canLoadAsync() returns false), so
// reaching this directly is a programming error; callers should go through
// BufferProvider::copyToDeviceAsyncIfPossible, which falls back to the
// synchronous copy for this manager.
void copyHostToDeviceAsync(int8_t* device_ptr,
                           const int8_t* host_ptr,
                           const size_t num_bytes,
                           const int device_num) override {
  CHECK(false);
}
// No-op for L0: there is no async stream to wait on, so misuse is surfaced
// as a log warning rather than a crash (unlike copyHostToDeviceAsync above).
void synchronizeStream(const int device_num) override {
  LOG(WARNING)
      << "L0 has no async data transfer enabled, synchronizeStream() has no effect";
}
void copyHostToDevice(int8_t* device_ptr,
const int8_t* host_ptr,
const size_t num_bytes,
Expand All @@ -225,7 +234,7 @@ class L0Manager : public GpuMgr {
const unsigned char uc,
const size_t num_bytes,
const int device_num) override;

// Always false for L0: async data loads are not available, so
// copyToDeviceAsyncIfPossible() falls back to the synchronous copy path.
// (Fixed: dropped the stray trailing ';' after the function body, which
// triggers -Wextra-semi style warnings.)
bool canLoadAsync() const override { return async_data_load_available; }
void synchronizeDevices() const override;
GpuMgrPlatform getPlatform() const override { return GpuMgrPlatform::L0; }
int getDeviceCount() const override {
Expand All @@ -252,6 +261,7 @@ class L0Manager : public GpuMgr {

private:
std::vector<std::shared_ptr<L0Driver>> drivers_;
static constexpr bool async_data_load_available{false};
};

} // namespace l0
Loading