From 1cee4fe37ea1f1e5f056731a95a69c57e54ee039 Mon Sep 17 00:00:00 2001 From: sjfeng1999 Date: Sun, 28 Aug 2022 02:18:05 +0800 Subject: [PATCH 1/2] Fix wrong tid when blockDim.y or z > 1 --- cub/block/block_shuffle.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/block/block_shuffle.cuh b/cub/block/block_shuffle.cuh index e4ebc7ff1e..58938301c1 100644 --- a/cub/block/block_shuffle.cuh +++ b/cub/block/block_shuffle.cuh @@ -190,7 +190,7 @@ public: CTA_SYNC(); - unsigned int offset = threadIdx.x + distance; + unsigned int offset = linear_tid + distance; if (offset >= BLOCK_THREADS) offset -= BLOCK_THREADS; From 92935e0f9f6553eacd2a8fbaaf006250fc0c5ce3 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 29 Aug 2022 08:43:04 +0400 Subject: [PATCH 2/2] Test block shuffle in multi-dim blocks --- test/test_block_shuffle.cu | 171 +++++++++++++++++++++---------------- 1 file changed, 99 insertions(+), 72 deletions(-) diff --git a/test/test_block_shuffle.cu b/test/test_block_shuffle.cu index c661f8c719..e951d897eb 100644 --- a/test/test_block_shuffle.cu +++ b/test/test_block_shuffle.cu @@ -73,20 +73,22 @@ void Iota( template < typename DataType, - unsigned int ThreadsInBlock, + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ, unsigned int ItemsPerThread, typename ActionType> __global__ void BlockShuffleTestKernel( DataType *data, ActionType action) { - typedef cub::BlockShuffle BlockShuffle; + typedef cub::BlockShuffle BlockShuffle; __shared__ typename BlockShuffle::TempStorage temp_storage_shuffle; DataType thread_data[ItemsPerThread]; - data += threadIdx.x * ItemsPerThread; + data += cub::RowMajorTid(BlockDimX, BlockDimY, BlockDimZ) * ItemsPerThread; for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_data[item] = data[item]; @@ -105,11 +107,14 @@ __global__ void BlockShuffleTestKernel( template< typename DataType, unsigned int ItemsPerThread, - unsigned int ThreadsInBlock, + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ, typename ActionType> void BlockShuffleTest(DataType *data, ActionType action) { - BlockShuffleTestKernel<<<1, ThreadsInBlock>>> (data, action); + dim3 block(BlockDimX, BlockDimY, BlockDimZ); + BlockShuffleTestKernel<<<1, block>>> (data, action); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); @@ -118,11 +123,13 @@ void BlockShuffleTest(DataType *data, ActionType action) template < typename DataType, unsigned int ItemsPerThread, - unsigned int ThreadsInBlock> + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ> struct UpTest { __device__ void operator()( - BlockShuffle &block_shuffle, + BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Up(thread_data, thread_data); @@ -142,11 +149,13 @@ struct UpTest template < typename DataType, unsigned int ItemsPerThread, - unsigned int ThreadsInBlock> + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ> struct DownTest { __device__ void operator()( - BlockShuffle &block_shuffle, + BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Down(thread_data, thread_data); @@ -154,7 +163,7 @@ struct DownTest static __host__ bool check(const DataType *data, int i) { - if (i == ItemsPerThread * ThreadsInBlock - 1) + if (i == ItemsPerThread * BlockDimX * BlockDimY * BlockDimZ - 1) { return data[i] == i; } @@ -164,14 +173,16 @@ struct DownTest }; template struct OffsetTestBase { static constexpr unsigned int ItemsPerThread = 1; __device__ void operator()( - BlockShuffle &block_shuffle, + BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Offset(thread_data[0], thread_data[0], offset); @@ -179,34 +190,40 @@ struct OffsetTestBase }; template -struct OffsetUpTest : public OffsetTestBase + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ> +struct OffsetUpTest : public OffsetTestBase { static __host__ bool check(const DataType *data, int i) { - return UpTest::check (data, i); + return UpTest::check (data, i); } }; template -struct OffsetDownTest : public OffsetTestBase + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ> +struct OffsetDownTest : public OffsetTestBase { static __host__ bool check(const DataType *data, int i) { - return DownTest::check (data, i); + return DownTest::check (data, i); } }; template struct RotateTestBase { static constexpr unsigned int ItemsPerThread = 1; __device__ void operator()( - BlockShuffle &block_shuffle, + BlockShuffle &block_shuffle, DataType (&thread_data)[ItemsPerThread]) const { block_shuffle.Rotate(thread_data[0], thread_data[0], offset); @@ -214,18 +231,22 @@ struct RotateTestBase static __host__ bool check(const DataType *data, int i) { - return data[i] == static_cast((i + offset) % ThreadsInBlock); + return data[i] == static_cast((i + offset) % (BlockDimX * BlockDimY * BlockDimZ)); } }; template -struct RotateUpTest : public RotateTestBase + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ> +struct RotateUpTest : public RotateTestBase { }; template -struct RotateTest : public RotateTestBase + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ> +struct RotateTest : public RotateTestBase { }; @@ -252,16 +273,18 @@ int CheckResult( template < typename DataType, unsigned int ItemsPerThread, - unsigned int ThreadsInBlock, - template class TestType> + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ, + template class TestType> void Test(unsigned int num_items, DataType *d_data, DataType *h_data) { - TestType test; + TestType test; Iota(num_items, d_data); - BlockShuffleTest(d_data, test); + BlockShuffleTest(d_data, test); AssertEquals(0, CheckResult(num_items, d_data, h_data, test)); } @@ -272,8 +295,10 @@ void Test(unsigned int num_items, template < typename DataType, unsigned int ItemsPerThread, - unsigned int ThreadsInBlock, - template class TestType> + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ, + template class TestType> struct SingleItemTestHelper { static void run(unsigned int /* num_items */, @@ -285,71 +310,71 @@ struct SingleItemTestHelper template < typename DataType, - unsigned int ThreadsInBlock, - template class TestType> -struct SingleItemTestHelper + unsigned int BlockDimX, + unsigned int BlockDimY, + unsigned int BlockDimZ, + template class TestType> +struct SingleItemTestHelper { static void run(unsigned int num_items, DataType *d_data, DataType *h_data) { - TestType test; + TestType test; Iota(num_items, d_data); - BlockShuffleTest(d_data, test); + BlockShuffleTest(d_data, test); AssertEquals(0, CheckResult(num_items, d_data, h_data, test)); } }; - -template < - typename DataType, - unsigned int ItemsPerThread, - unsigned int ThreadsInBlock> -void Test( - CachingDeviceAllocator &g_allocator -) +template +void Test(CachingDeviceAllocator &g_allocator) { - const unsigned int num_items = ItemsPerThread * ThreadsInBlock; + const unsigned int num_items = BlockDimX * BlockDimY * BlockDimZ * ItemsPerThread; - DataType *d_data = nullptr; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(DataType) * num_items)); + DataType *d_data = nullptr; + CubDebugExit(g_allocator.DeviceAllocate((void **)&d_data, sizeof(DataType) * num_items)); - std::unique_ptr h_data(new DataType[num_items]); + std::unique_ptr h_data(new DataType[num_items]); - Test(num_items, d_data, h_data.get()); - Test(num_items, d_data, h_data.get()); + Test(num_items, + d_data, + h_data.get()); + Test(num_items, + d_data, + h_data.get()); - SingleItemTestHelper().run(num_items, - d_data, - h_data.get()); + SingleItemTestHelper() + .run(num_items, d_data, h_data.get()); - SingleItemTestHelper().run(num_items, - d_data, - h_data.get()); + SingleItemTestHelper() + .run(num_items, d_data, h_data.get()); - SingleItemTestHelper().run(num_items, - d_data, - h_data.get()); + SingleItemTestHelper() + .run(num_items, d_data, h_data.get()); - SingleItemTestHelper().run(num_items, - d_data, - h_data.get()); + SingleItemTestHelper() + .run(num_items, d_data, h_data.get()); - if (d_data) - { - CubDebugExit(g_allocator.DeviceFree(d_data)); - } + if (d_data) + { + CubDebugExit(g_allocator.DeviceFree(d_data)); + } } -template +template void Test(CachingDeviceAllocator &g_allocator) { - Test(g_allocator); - Test(g_allocator); - Test(g_allocator); - Test(g_allocator); - Test(g_allocator); + Test(g_allocator); + Test(g_allocator); + Test(g_allocator); + Test(g_allocator); + Test(g_allocator); } int main(int argc, char** argv) @@ -365,5 +390,7 @@ int main(int argc, char** argv) Test<2> (g_allocator); Test<15> (g_allocator); + Test(g_allocator); + return 0; }