This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Can't get correct result when use cub in CUDA12.0 #719

Closed
YuanRisheng opened this issue Jun 19, 2023 · 24 comments

@YuanRisheng

YuanRisheng commented Jun 19, 2023

I get an incorrect result when I use cub::DeviceReduce::Reduce in CUDA 12.0. The error occurs only when building shared targets. This is my code:

test_main.cc

#include <iostream>

extern float Reduce1024x100();
int main() {
  auto ret = Reduce1024x100();
  std::cout << ret << std::endl;
  return 0;
}

test_functor.h

#include <iostream>
#include <stdexcept>  // std::runtime_error, used by CUDA_CHECK
#include <string>     // std::string / std::to_string, used by CUDA_CHECK
#include <vector>
#include "cub/cub.cuh"
#include "cuda.h"

#define CUDA_CHECK(__x)                                       \
  do {                                                        \
    auto __cond = (__x);                                      \
    if (__cond != cudaSuccess) {                              \
      auto __msg = std::string(#__x) + " " + __FILE__ + ":" + \
                   std::to_string(__LINE__) + ": " +          \
                   cudaGetErrorString(__cond) + " , code " +  \
                   std::to_string(__cond);                    \
      throw std::runtime_error(__msg);                        \
    }                                                         \
  } while (0)

template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
  __host__ inline IdentityFunctor() {}

  __device__ explicit inline IdentityFunctor(int n) {}

  __device__ inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(x);
  }
};

template <typename T>
struct AddFunctor {
  inline T initial() { return static_cast<T>(0.0f); }

  __device__ T operator()(const T a, const T b) const { return b + a; }
};

template <typename T>
T *CudaMalloc(size_t n) {
  if (n == 0) return nullptr;
  T *p = nullptr;
  CUDA_CHECK(cudaMalloc(&p, n * sizeof(T)));
  return p;
}

test_moduleA.cu

#include "test_functor.h"

void CudaFree(void *p) {
  if (p == nullptr) return;
  CUDA_CHECK(cudaFree(p));
}
template <typename T>
T TestMain(const std::vector<T> &cpu) {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  size_t n = cpu.size();
  auto *gpu = CudaMalloc<T>(n);
  CUDA_CHECK(cudaMemcpyAsync(
      gpu, cpu.data(), n * sizeof(T), cudaMemcpyHostToDevice, stream));
  auto *gpu_ret = CudaMalloc<T>(1);
  auto addf = AddFunctor<float>();
  auto trans = IdentityFunctor<float, float>(n);
  cub::TransformInputIterator<float,
                              IdentityFunctor<float, float>,
                              const float *>
      trans_x(gpu, trans);
  size_t tmp_bytes;
  CUDA_CHECK(cub::DeviceReduce::Reduce(
      nullptr, tmp_bytes, trans_x, gpu_ret, n, addf, 0.0f, stream));
  std::cout << "tmp_bytes:" << tmp_bytes << std::endl;
  uint8_t *gpu_tmp = CudaMalloc<uint8_t>(tmp_bytes);
  CUDA_CHECK(cub::DeviceReduce::Reduce(
      gpu_tmp, tmp_bytes, trans_x, gpu_ret, n, addf, 0.0f, stream));

  T cpu_ret;
  CUDA_CHECK(cudaMemcpyAsync(
      &cpu_ret, gpu_ret, sizeof(T), cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));

  CudaFree(gpu);
  CudaFree(gpu_ret);
  CudaFree(gpu_tmp);
  return cpu_ret;
}

float Reduce1024x100() {
  std::cout << "CUB version : " << CUB_VERSION << std::endl;
  std::vector<float> data(1024 * 100, 1);
  auto ret = TestMain(data);
  return ret;
}

void UseModuleA() {}

test_moduleB.cu

#include "test_functor.h"
extern void UseModuleA();

float NoUse() {
  cudaStream_t stream;
  auto *gpu = CudaMalloc<float>(1);
  auto *gpu_ret = CudaMalloc<float>(1);
  auto addf = AddFunctor<float>();
  auto trans = IdentityFunctor<float, float>();
  cub::TransformInputIterator<float,
                              IdentityFunctor<float, float>,
                              const float *>
      trans_x(gpu, trans);
  size_t tmp_arg = 0;
  cub::DeviceReduce::Reduce(
      nullptr, tmp_arg, trans_x, gpu_ret, tmp_arg, addf, 0.0f, stream);

  UseModuleA();
  return 1;
}

CMakeLists.txt:

add_library(test_moduleA SHARED test_moduleA.cu)
add_library(test_moduleB SHARED test_moduleB.cu)
target_link_libraries(test_moduleB test_moduleA)
add_executable(test_main test_main.cc)
target_link_libraries(test_main test_moduleB)

I get a zero result when I run test_main with CUDA 12:

[screenshot: program prints 0]

But I get the correct result with CUDA 11.2:

[screenshot: program prints the expected sum]

Please help me deal with this issue. Thank you!

@gevtushenko
Collaborator

Hello @YuanRisheng! I've tried CUDA 12.0 and 12.1 and the result seems fine:

-- The CUDA compiler identification is NVIDIA 12.0.140
-- The CXX compiler identification is GNU 11.3.0

CUB version : 200001
tmp_bytes:15615
102400

Your CMakeLists seems incomplete, so I've added a few lines:

project(test LANGUAGES CUDA CXX)   # <-- 1
set(CMAKE_CUDA_ARCHITECTURES "89") # <-- 2

add_library(test_moduleA SHARED test_moduleA.cu)
add_library(test_moduleB SHARED test_moduleB.cu)
target_link_libraries(test_moduleB test_moduleA)
add_executable(test_main test_main.cc)
target_link_libraries(test_main test_moduleB)

Could you validate the change? Also, which host compiler are you using, and what are the exact nvcc version and GPU architecture?

@gevtushenko gevtushenko moved this from Todo to Awaiting Feedback in CCCL Jun 19, 2023
@YuanRisheng
Author

YuanRisheng commented Jun 20, 2023

Thanks for your reply! @senior-zero I changed my CMakeLists as below and it still doesn't work:

project(test LANGUAGES CUDA CXX)
set(CMAKE_CUDA_ARCHITECTURES "80")

add_library(test_moduleA SHARED test_moduleA.cu)
add_library(test_moduleB SHARED test_moduleB.cu)
target_link_libraries(test_moduleB test_moduleA)
add_executable(test_main test_main.cc)
target_link_libraries(test_main test_moduleB)

nvcc --version reports:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0

and nvidia-smi reports:

[screenshot of nvidia-smi output]

cmake --version reports:

cmake version 3.18.0

CMake suite maintained and supported by Kitware (kitware.com/cmake).

@YuanRisheng
Author

YuanRisheng commented Jun 20, 2023

There is also an interesting phenomenon. When I change the size of the data in the function Reduce1024x100, for example from 1024 x 100 to 4096 x 1, it works well. When I make the size larger, it fails.

@gevtushenko
Collaborator

@YuanRisheng I'm still unable to reproduce the issue. Could you please:

  1. share the version of the host compiler? It is printed during CMake configuration:

     cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.0/bin/nvcc ..
     -- The CUDA compiler identification is NVIDIA 12.0.140
     -- The CXX compiler identification is GNU 11.3.0  # <--- this line

  2. try out the GitHub version of Thrust/CUB

In case I'm unable to reproduce the issue, we might need a docker image with a reproducer.

gevtushenko added a commit to gevtushenko/reduce_repro that referenced this issue Jun 20, 2023
@gevtushenko
Collaborator

@YuanRisheng @elstehle here are the commands with docker:

docker run --gpus all -it nvidia/cuda:12.0.0-devel-ubuntu22.04
cd
apt update && apt install cmake g++ vim git
git clone https://github.com/senior-zero/reduce_repro.git
cd reduce_repro/
./run.sh

@YuanRisheng
Author

@senior-zero I used this docker image on my machine:

docker pull zhengqiwen/paddle_cuda12.0:latest-dev-cuda12.0.1-cudnn8.9.1-gcc122-ubu2004
docker run --name XXX -it zhengqiwen/paddle_cuda12.0:latest-dev-cuda12.0.1-cudnn8.9.1-gcc122-ubu2004 bash

Maybe you could reproduce this issue using this image. Thank you for your help again!

@elstehle
Collaborator

Thank you for sharing your docker image, @YuanRisheng. Unfortunately, I am still not able to reproduce the issue using the given image. Are you able to reproduce it when using your image along with @senior-zero's reproducer?

docker pull zhengqiwen/paddle_cuda12.0:latest-dev-cuda12.0.1-cudnn8.9.1-gcc122-ubu2004
docker run --gpus all --name XXX -it zhengqiwen/paddle_cuda12.0:latest-dev-cuda12.0.1-cudnn8.9.1-gcc122-ubu2004 bash
git clone https://github.com/senior-zero/reduce_repro.git
cd reduce_repro/
./run.sh

@YuanRisheng
Author

YuanRisheng commented Jun 20, 2023

@elstehle Wow! I found the difference. Somehow, I had O3 optimization enabled before. Please use cmake -DCMAKE_BUILD_TYPE=Release .. when building the targets. This will reproduce the issue.
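
In full, the build-and-run sequence would look roughly like this (directory layout illustrative, using the targets from the CMakeLists above):

mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make
./test_main   # prints 0 instead of the expected 102400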

@elstehle
Collaborator

Thank you, @YuanRisheng! I am now able to reproduce the issue with the container image you shared with us. So far I haven't been able to pin down the root cause. I'll continue the investigation and keep you posted.

@gevtushenko gevtushenko moved this from Awaiting Feedback to In Progress in CCCL Jun 29, 2023
@elstehle
Collaborator

elstehle commented Jul 5, 2023

Thank you for bringing this issue to our attention, @YuanRisheng. We are exploring options to fix the issue. We will update this issue once we have a concrete solution.

@YuanRisheng
Author

Thanks for your attention! This issue is blocking my work in some situations, and I will continue to follow it!

@YuanRisheng
Author

A reminder: please don't forget this issue. Thank you!

@miscco
Collaborator

miscco commented Jul 21, 2023

Hi @YuanRisheng

We have been quite busy recently consolidating our libraries into a single unified monorepository. That, and the inherent high risk of breaking working code when changing the linkage of kernels, have pushed this item past the deadline for the upcoming release.

Consequently, we won't have time to work on this issue until the release work is done. That said, once a fix has been implemented, you will be able to pull it directly from https://github.com/NVIDIA/cccl, which is our new unified monorepository.

@YuanRisheng
Author

YuanRisheng commented Jul 24, 2023


I got it, thanks!

@gevtushenko
Collaborator

Hello @YuanRisheng!

While we are figuring out how to address the issue, you could work around it by wrapping the CUB/Thrust namespaces in each library:

test_moduleA.cu

// goes before all the headers or in target_compile_definitions
#define CUB_WRAPPED_NAMESPACE A
#define THRUST_WRAPPED_NAMESPACE A
...
CUDA_CHECK(A::cub::DeviceReduce::Reduce(nullptr, tmp_bytes, trans_x, gpu_ret, n, addf, 0.0f));

test_moduleB.cu

// goes before all the headers or in target_compile_definitions
#define CUB_WRAPPED_NAMESPACE B
#define THRUST_WRAPPED_NAMESPACE B
...
CUDA_CHECK(B::cub::DeviceReduce::Reduce(nullptr, tmp_bytes, trans_x, gpu_ret, n, addf, 0.0f));
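
If you prefer not to touch the sources, the same macros can be set per target; a minimal CMake sketch, using the targets from the CMakeLists above:

target_compile_definitions(test_moduleA PRIVATE
    CUB_WRAPPED_NAMESPACE=A
    THRUST_WRAPPED_NAMESPACE=A)
target_compile_definitions(test_moduleB PRIVATE
    CUB_WRAPPED_NAMESPACE=B
    THRUST_WRAPPED_NAMESPACE=B)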

@YuanRisheng
Author

@senior-zero I will try it! Thanks for your solution!

@tianshuo78520a

Hello @senior-zero,

May I ask if this problem has been resolved? I tried to use the above method to solve it, but it failed. I think this is a very serious bug. Can you help me take a look?

@gevtushenko
Collaborator

@tianshuo78520a we haven't addressed the issue on our side yet. You can track the status here: NVIDIA/cccl#166. The above method should work; note that, for now, you have to wrap the cub/thrust namespaces in every shared library that uses cub/thrust. If wrapping namespaces doesn't work for you, it might be a different problem. In that case, please provide a reproducer.

@tianshuo78520a

Thank you for your reply. We will follow this issue and try using the above method again.

@gevtushenko
Collaborator

@YuanRisheng, @tianshuo78520a we've just merged a fix that should help address the issue when a wrapped namespace is not specified. Could you please verify that it works for you?

@YuanRisheng
Author

@senior-zero Thank you for the fix. But I don't know how to use the main branch of cccl. The cub I have been using is the one in the CUDA install dir. Do I need to update CUDA, or pull cccl to override the cub in my environment?

@gevtushenko
Collaborator

@YuanRisheng this page might help you use the latest CCCL version: https://github.com/NVIDIA/cccl/tree/main/examples/example_project
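
In short, you shouldn't need to update CUDA; the example project pulls CCCL from GitHub so that it takes precedence over the copy shipped with the CUDA toolkit. A rough sketch with CMake's FetchContent, assuming the CCCL::CCCL target that the example project links against:

include(FetchContent)
FetchContent_Declare(
  cccl
  GIT_REPOSITORY https://github.com/NVIDIA/cccl.git
  GIT_TAG main)
FetchContent_MakeAvailable(cccl)

# Linking CCCL::CCCL puts the fetched cub/thrust headers ahead of the
# toolkit's copies on the include path.
target_link_libraries(test_moduleA PRIVATE CCCL::CCCL)
target_link_libraries(test_moduleB PRIVATE CCCL::CCCL)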

@YuanRisheng
Author


@senior-zero This fix solves my problem. Thanks again for what your team has done!

@gevtushenko
Collaborator

@YuanRisheng thank you for reporting the issue! Since the fix works for you, I'm closing it.
