Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix nvrtc usage #60943

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions paddle/fluid/platform/dynload/nvrtc.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ extern bool HasNVRTC();
__macro(nvrtcCompileProgram); \
__macro(nvrtcCreateProgram); \
__macro(nvrtcDestroyProgram); \
__macro(nvrtcGetCUBIN); \
__macro(nvrtcGetCUBINSize); \
__macro(nvrtcGetPTX); \
__macro(nvrtcGetPTXSize); \
__macro(nvrtcGetProgramLog); \
Expand Down
18 changes: 9 additions & 9 deletions paddle/phi/backends/device_code.cc
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ bool GPUDeviceCode::Compile(bool include_path) {
DeviceContextPool::Instance().Get(place_));
int compute_capability = dev_ctx->GetComputeCapability();
std::string compute_flag =
"--gpu-architecture=compute_" + std::to_string(compute_capability);
"--gpu-architecture=sm_" + std::to_string(compute_capability);
std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-gpu-architecture=sm_XX:这个选项会让编译器直接生成目标GPU架构的机器码(SASS)。使用sm_XX可以避免运行时的JIT编译步骤,但是这样生成的代码只能在相同架构的GPU上运行。使用compute_XX可以使得生成的代码在不同架构的GPU上都能运行。有考虑过这个问题吗?

std::string include_option;
if (include_path) {
Expand Down Expand Up @@ -369,15 +369,15 @@ bool GPUDeviceCode::Compile(bool include_path) {
return false;
}

// Obtain PTX from the program
size_t ptx_size;
if (!CheckNVRTCResult(dynload::nvrtcGetPTXSize(program, &ptx_size),
"nvrtcGetPTXSize")) {
// Obtain cubin from the program
size_t cubin_size;
if (!CheckNVRTCResult(dynload::nvrtcGetCUBINSize(program, &cubin_size),
"nvrtcGetCUBINSize")) {
return false;
}
ptx_.resize(ptx_size + 1);
if (!CheckNVRTCResult(dynload::nvrtcGetPTX(program, ptx_.data()),
"nvrtcGetPTX")) {
cubin_.resize(cubin_size + 1);
if (!CheckNVRTCResult(dynload::nvrtcGetCUBIN(program, cubin_.data()),
"nvrtcGetCUBIN")) {
return false;
}

Expand All @@ -386,7 +386,7 @@ bool GPUDeviceCode::Compile(bool include_path) {
return false;
}

if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, ptx_.data()),
if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, cubin_.data()),
"cuModuleLoadData",
name_)) {
return false;
Expand Down
1 change: 1 addition & 0 deletions paddle/phi/backends/device_code.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class GPUDeviceCode : public DeviceCode {
int max_threads_{0};
int num_threads_{1024};
int workload_per_thread_{1};
std::vector<char> cubin_;
std::vector<char> ptx_;
#ifdef PADDLE_WITH_HIP
hipModule_t module_;
Expand Down
2 changes: 2 additions & 0 deletions paddle/phi/backends/dynload/nvrtc.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ extern bool HasNVRTC();
__macro(nvrtcCompileProgram); \
__macro(nvrtcCreateProgram); \
__macro(nvrtcDestroyProgram); \
__macro(nvrtcGetCUBIN); \
__macro(nvrtcGetCUBINSize); \
__macro(nvrtcGetPTX); \
__macro(nvrtcGetPTXSize); \
__macro(nvrtcGetProgramLog); \
Expand Down