This repository was archived by the owner on Apr 3, 2024. It is now read-only.

Commit 63fc662

Merge pull request NVIDIA#675 from mfbalin/fancy_device_memcpy

Initial implementation for DeviceCopy::Batched

2 parents 76471de + 5523b9d

6 files changed (+874, -62)

cub/agent/agent_batch_memcpy.cuh (+106, -21)
@@ -42,6 +42,8 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <cstdint>
 
 CUB_NAMESPACE_BEGIN
@@ -287,6 +289,82 @@ VectorizedCopy(int32_t thread_rank, void *dest, ByteOffsetT num_bytes, const voi
   }
 }
 
+template <bool IsMemcpy,
+          uint32_t LOGICAL_WARP_SIZE,
+          typename InputBufferT,
+          typename OutputBufferT,
+          typename OffsetT,
+          typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0>
+__device__ __forceinline__ void copy_items(InputBufferT input_buffer,
+                                           OutputBufferT output_buffer,
+                                           OffsetT num_bytes,
+                                           OffsetT offset = 0)
+{
+  VectorizedCopy<LOGICAL_WARP_SIZE, uint4>(threadIdx.x % LOGICAL_WARP_SIZE,
+                                           &reinterpret_cast<char *>(output_buffer)[offset],
+                                           num_bytes,
+                                           &reinterpret_cast<const char *>(input_buffer)[offset]);
+}
+
+template <bool IsMemcpy,
+          uint32_t LOGICAL_WARP_SIZE,
+          typename InputBufferT,
+          typename OutputBufferT,
+          typename OffsetT,
+          typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0>
+__device__ __forceinline__ void copy_items(InputBufferT input_buffer,
+                                           OutputBufferT output_buffer,
+                                           OffsetT num_items,
+                                           OffsetT offset = 0)
+{
+  output_buffer += offset;
+  input_buffer += offset;
+  for (OffsetT i = threadIdx.x % LOGICAL_WARP_SIZE; i < num_items; i += LOGICAL_WARP_SIZE)
+  {
+    *(output_buffer + i) = *(input_buffer + i);
+  }
+}
+
+template <bool IsMemcpy,
+          typename AliasT,
+          typename InputIt,
+          typename OffsetT,
+          typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0>
+__device__ __forceinline__ AliasT read_item(InputIt buffer_src, OffsetT offset)
+{
+  return *(reinterpret_cast<const AliasT *>(buffer_src) + offset);
+}
+
+template <bool IsMemcpy,
+          typename AliasT,
+          typename InputIt,
+          typename OffsetT,
+          typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0>
+__device__ __forceinline__ AliasT read_item(InputIt buffer_src, OffsetT offset)
+{
+  return *(buffer_src + offset);
+}
+
+template <bool IsMemcpy,
+          typename AliasT,
+          typename OutputIt,
+          typename OffsetT,
+          typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0>
+__device__ __forceinline__ void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value)
+{
+  *(reinterpret_cast<AliasT *>(buffer_dst) + offset) = value;
+}
+
+template <bool IsMemcpy,
+          typename AliasT,
+          typename OutputIt,
+          typename OffsetT,
+          typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0>
+__device__ __forceinline__ void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value)
+{
+  *(buffer_dst + offset) = value;
+}
+
 /**
  * @brief A helper class that allows threads to maintain multiple counters, where the counter that
  *        shall be incremented can be addressed dynamically without incurring register spillage.
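These helpers are the core of the change: copy_items, read_item, and write_item are each overloaded on the compile-time IsMemcpy flag through enable_if, so a single agent implementation compiles down to byte-wise copies for DeviceMemcpy::Batched and element-wise iterator copies for DeviceCopy::Batched. A minimal standalone sketch of this dispatch pattern, using hypothetical names (copy_one, copy_kernel) rather than CUB's actual helpers:

// dispatch_sketch.cu -- a minimal sketch (not CUB's actual code) of the
// enable_if dispatch on IsMemcpy used by the helpers above.
#include <cuda/std/type_traits>
#include <cstdio>

template <bool IsMemcpy, typename InT, typename OutT, typename OffsetT,
          typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0>
__device__ __forceinline__ void copy_one(InT in, OutT out, OffsetT i)
{
  // Byte-wise path: both buffers are treated as raw bytes
  reinterpret_cast<char *>(out)[i] = reinterpret_cast<const char *>(in)[i];
}

template <bool IsMemcpy, typename InT, typename OutT, typename OffsetT,
          typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0>
__device__ __forceinline__ void copy_one(InT in, OutT out, OffsetT i)
{
  // Element-wise path: the buffers are iterators and are dereferenced
  *(out + i) = *(in + i);
}

template <bool IsMemcpy, typename InT, typename OutT>
__global__ void copy_kernel(InT in, OutT out, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
  {
    copy_one<IsMemcpy>(in, out, i);
  }
}

int main()
{
  int h_in[4] = {1, 2, 3, 4};
  int h_out[4] = {0, 0, 0, 0};
  int *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  copy_kernel<false><<<1, 4>>>(d_in, d_out, 4);  // copies four ints element-wise
  copy_kernel<true><<<1, 16>>>(d_in, d_out, 16); // copies the same sixteen bytes byte-wise

  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("%d %d %d %d\n", h_out[0], h_out[1], h_out[2], h_out[3]); // prints: 1 2 3 4
  return 0;
}

The two overloads share a signature and differ only in the enable_if condition, so exactly one of them is viable for any given IsMemcpy value.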
@@ -431,7 +509,8 @@ template <typename AgentMemcpySmallBuffersPolicyT,
           typename BlevBufferTileOffsetsOutItT,
           typename BlockOffsetT,
           typename BLevBufferOffsetTileState,
-          typename BLevBlockOffsetTileState>
+          typename BLevBlockOffsetTileState,
+          bool IsMemcpy>
 class AgentBatchMemcpy
 {
 private:
@@ -470,7 +549,14 @@ private:
   // TYPE DECLARATIONS
   //---------------------------------------------------------------------
   /// Internal load/store type. For byte-wise memcpy, a single-byte type
-  using AliasT = char;
+  using AliasT = typename ::cuda::std::conditional<
+    IsMemcpy,
+    std::iterator_traits<char *>,
+    std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type;
+
+  /// Types of the input and output buffers
+  using InputBufferT = cub::detail::value_t<InputBufferIt>;
+  using OutputBufferT = cub::detail::value_t<OutputBufferIt>;
 
   /// Type that has to be sufficiently large to hold any of the buffers' sizes.
   /// The BufferSizeIteratorT's value type must be convertible to this type.
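With this conditional, AliasT stays char for the byte-wise memcpy specialization but becomes the element type of the per-range source iterators for DeviceCopy. A small host-side sketch of how the selection resolves, substituting std::iterator_traits for cub::detail::value_t (the alias and the example iterator types here are hypothetical):

// alias_t_sketch.cpp -- host-side illustration of the AliasT selection above;
// InputBufferIt is assumed to be an iterator over per-range iterators.
#include <iterator>
#include <type_traits>

template <bool IsMemcpy, typename InputBufferIt>
using AliasT = typename std::conditional<
  IsMemcpy,
  std::iterator_traits<char *>,
  std::iterator_traits<typename std::iterator_traits<InputBufferIt>::value_type>>::type::value_type;

int main()
{
  // Memcpy path: the buffers are untyped (void *), so the load/store type is a byte.
  static_assert(std::is_same<AliasT<true, void **>, char>::value, "byte-wise");
  // Copy path: int ** is an iterator over int * ranges, so AliasT is int.
  static_assert(std::is_same<AliasT<false, int **>, int>::value, "element-wise");
  return 0;
}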
@@ -775,17 +861,16 @@
                                              BlockBufferOffsetT num_wlev_buffers)
   {
     const int32_t warp_id = threadIdx.x / CUB_PTX_WARP_THREADS;
-    const int32_t warp_lane = threadIdx.x % CUB_PTX_WARP_THREADS;
     constexpr uint32_t WARPS_PER_BLOCK = BLOCK_THREADS / CUB_PTX_WARP_THREADS;
 
     for (BlockBufferOffsetT buffer_offset = warp_id; buffer_offset < num_wlev_buffers;
          buffer_offset += WARPS_PER_BLOCK)
     {
       const auto buffer_id = buffers_by_size_class[buffer_offset].buffer_id;
-      detail::VectorizedCopy<CUB_PTX_WARP_THREADS, uint4>(warp_lane,
-                                                          tile_buffer_dsts[buffer_id],
-                                                          tile_buffer_sizes[buffer_id],
-                                                          tile_buffer_srcs[buffer_id]);
+      copy_items<IsMemcpy, CUB_PTX_WARP_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
+        tile_buffer_srcs[buffer_id],
+        tile_buffer_dsts[buffer_id],
+        tile_buffer_sizes[buffer_id]);
     }
   }
 
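For the element-wise case, this warp-level path reduces to a plain warp-strided loop: lane k of a warp handles items k, k + 32, k + 64, and so on, which keeps accesses coalesced for contiguous ranges. A minimal standalone kernel sketching that access pattern (hypothetical, not CUB code):

// warp_strided_copy.cu -- sketch of the warp-strided loop used by the
// element-wise copy_items path: lane k copies items k, k + 32, k + 64, ...
#include <cstdio>

constexpr int WARP_SIZE = 32;

__global__ void warp_copy(const int *in, int *out, int num_items)
{
  const int lane = threadIdx.x % WARP_SIZE;
  // All lanes of the warp stride through the range together,
  // so consecutive lanes touch consecutive elements.
  for (int i = lane; i < num_items; i += WARP_SIZE)
  {
    out[i] = in[i];
  }
}

int main()
{
  const int n = 100;
  int h_in[n], h_out[n];
  for (int i = 0; i < n; ++i)
  {
    h_in[i] = i;
  }

  int *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(int));
  cudaMalloc(&d_out, n * sizeof(int));
  cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);

  warp_copy<<<1, WARP_SIZE>>>(d_in, d_out, n); // one warp copies all 100 items

  cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
  printf("h_out[99] = %d\n", h_out[99]); // prints: h_out[99] = 99
  return 0;
}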
@@ -875,18 +960,18 @@
 #pragma unroll
       for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
       {
-        src_byte[i] = reinterpret_cast<const AliasT *>(
-          tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id])[zipped_byte_assignment[i]
-                                                                        .buffer_byte_offset];
+        src_byte[i] = read_item<IsMemcpy, AliasT, InputBufferT>(
+          tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id],
+          zipped_byte_assignment[i].buffer_byte_offset);
         absolute_tlev_byte_offset += BLOCK_THREADS;
       }
 #pragma unroll
       for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
       {
-        reinterpret_cast<AliasT *>(
-          tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id])[zipped_byte_assignment[i]
-                                                                        .buffer_byte_offset] =
-          src_byte[i];
+        write_item<IsMemcpy, AliasT, OutputBufferT>(
+          tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id],
+          zipped_byte_assignment[i].buffer_byte_offset,
+          src_byte[i]);
       }
     }
     else
@@ -897,13 +982,13 @@ private:
       {
         if (absolute_tlev_byte_offset < num_total_tlev_bytes)
         {
-          const AliasT src_byte = reinterpret_cast<const AliasT *>(
-            tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id])[zipped_byte_assignment[i]
-                                                                          .buffer_byte_offset];
-          reinterpret_cast<AliasT *>(
-            tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id])[zipped_byte_assignment[i]
-                                                                          .buffer_byte_offset] =
-            src_byte;
+          const AliasT src_byte = read_item<IsMemcpy, AliasT, InputBufferT>(
+            tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id],
+            zipped_byte_assignment[i].buffer_byte_offset);
+          write_item<IsMemcpy, AliasT, OutputBufferT>(
+            tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id],
+            zipped_byte_assignment[i].buffer_byte_offset,
+            src_byte);
         }
         absolute_tlev_byte_offset += BLOCK_THREADS;
       }

cub/cub.cuh (+1)
@@ -52,6 +52,7 @@
 
 // Device
 #include "device/device_adjacent_difference.cuh"
+#include "device/device_copy.cuh"
 #include "device/device_histogram.cuh"
 #include "device/device_memcpy.cuh"
 #include "device/device_merge_sort.cuh"

cub/device/device_copy.cuh (+172, new file)
@@ -0,0 +1,172 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceCopy provides device-wide, parallel operations for copying data.
+ */
+
+#pragma once
+
+#include <cub/config.cuh>
+#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include <cstdint>
+
+CUB_NAMESPACE_BEGIN
+
+/**
+ * @brief cub::DeviceCopy provides device-wide, parallel operations for copying data.
+ * \ingroup SingleModule
+ */
+struct DeviceCopy
+{
+  /**
+   * @brief Copies data from a batch of given source ranges to their corresponding destination
+   * ranges.
+   * @note If any input range aliases any output range, the behavior is undefined. If any
+   * output range aliases another output range, the behavior is undefined. Input ranges may
+   * alias one another.
+   *
+   * @par Snippet
+   * The code snippet below illustrates usage of DeviceCopy::Batched to perform a
+   * DeviceRunLengthDecode operation.
+   * @par
+   * @code
+   * struct GetIteratorToRange
+   * {
+   *   __host__ __device__ __forceinline__ auto operator()(uint32_t index)
+   *   {
+   *     return thrust::make_constant_iterator(d_data_in[index]);
+   *   }
+   *   int32_t *d_data_in;
+   * };
+   *
+   * struct GetPtrToRange
+   * {
+   *   __host__ __device__ __forceinline__ auto operator()(uint32_t index)
+   *   {
+   *     return d_data_out + d_offsets[index];
+   *   }
+   *   int32_t *d_data_out;
+   *   uint32_t *d_offsets;
+   * };
+   *
+   * struct GetRunLength
+   * {
+   *   __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
+   *   {
+   *     return d_offsets[index + 1] - d_offsets[index];
+   *   }
+   *   uint32_t *d_offsets;
+   * };
+   *
+   * uint32_t num_ranges = 5;
+   * int32_t *d_data_in;   // e.g., [4, 2, 7, 3, 1]
+   * int32_t *d_data_out;  // e.g., [0, ... ]
+   * uint32_t *d_offsets;  // e.g., [0, 2, 5, 6, 9, 14]
+   *
+   * // Returns a constant iterator to the element of the i-th run
+   * thrust::counting_iterator<uint32_t> iota(0);
+   * auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in});
+   *
+   * // Returns the run length of the i-th run
+   * auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets});
+   *
+   * // Returns pointers to the output range for each run
+   * auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets});
+   *
+   * // Determine temporary device storage requirements
+   * void *d_temp_storage = nullptr;
+   * size_t temp_storage_bytes = 0;
+   * cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
+   *                          num_ranges);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run batched copy algorithm (used to perform run-length decoding)
+   * cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
+   *                          num_ranges);
+   *
+   * // d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1]
+   * @endcode
+   *
+   * @tparam InputIt <b>[inferred]</b> Device-accessible random-access input iterator type
+   * providing the iterators to the source ranges
+   * @tparam OutputIt <b>[inferred]</b> Device-accessible random-access input iterator type
+   * providing the iterators to the destination ranges
+   * @tparam SizeIteratorT <b>[inferred]</b> Device-accessible random-access input iterator
+   * type providing the number of items to be copied for each pair of ranges
+   * @param d_temp_storage [in] Device-accessible allocation of temporary storage. When NULL, the
+   * required allocation size is written to \p temp_storage_bytes and no work is done.
+   * @param temp_storage_bytes [in,out] Reference to size in bytes of \p d_temp_storage allocation
+   * @param input_it [in] Device-accessible iterator providing the iterators to the source ranges
+   * @param output_it [in] Device-accessible iterator providing the iterators to the destination
+   * ranges
+   * @param sizes [in] Device-accessible iterator providing the number of elements to be copied
+   * for each pair of ranges
+   * @param num_ranges [in] The total number of range pairs
+   * @param stream [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is
+   * stream<sub>0</sub>.
+   */
+  template <typename InputIt, typename OutputIt, typename SizeIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t Batched(void *d_temp_storage,
+                                                  size_t &temp_storage_bytes,
+                                                  InputIt input_it,
+                                                  OutputIt output_it,
+                                                  SizeIteratorT sizes,
+                                                  uint32_t num_ranges,
+                                                  cudaStream_t stream = 0)
+  {
+    // Integer type large enough to hold any offset in [0, num_ranges)
+    using RangeOffsetT = uint32_t;
+
+    // Integer type large enough to hold any offset in [0, num_thread_blocks_launched), where a
+    // safe upper bound on num_thread_blocks_launched can be assumed to be given by
+    // IDIV_CEIL(num_ranges, 64)
+    using BlockOffsetT = uint32_t;
+
+    return detail::DispatchBatchMemcpy<InputIt,
+                                       OutputIt,
+                                       SizeIteratorT,
+                                       RangeOffsetT,
+                                       BlockOffsetT,
+                                       detail::DeviceBatchMemcpyPolicy,
+                                       false>::Dispatch(d_temp_storage,
+                                                        temp_storage_bytes,
+                                                        input_it,
+                                                        output_it,
+                                                        sizes,
+                                                        num_ranges,
+                                                        stream);
+  }
+};
+
+CUB_NAMESPACE_END
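The documentation snippet above can be assembled into a complete program. Below is a self-contained version (a sketch: the host-side allocations and the 14-element output size come from the example values in the doc comment; error checking is omitted for brevity):

// batched_copy_rld.cu -- run-length decoding via cub::DeviceCopy::Batched,
// following the documentation example above.
#include <cub/device/device_copy.cuh>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cstdint>
#include <cstdio>

struct GetIteratorToRange
{
  __host__ __device__ __forceinline__ auto operator()(uint32_t index)
  {
    return thrust::make_constant_iterator(d_data_in[index]);
  }
  int32_t *d_data_in;
};

struct GetPtrToRange
{
  __host__ __device__ __forceinline__ auto operator()(uint32_t index)
  {
    return d_data_out + d_offsets[index];
  }
  int32_t *d_data_out;
  uint32_t *d_offsets;
};

struct GetRunLength
{
  __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
  {
    return d_offsets[index + 1] - d_offsets[index];
  }
  uint32_t *d_offsets;
};

int main()
{
  constexpr uint32_t num_ranges = 5;
  const int32_t h_data_in[num_ranges]      = {4, 2, 7, 3, 1};
  const uint32_t h_offsets[num_ranges + 1] = {0, 2, 5, 6, 9, 14};

  int32_t *d_data_in, *d_data_out;
  uint32_t *d_offsets;
  cudaMalloc(&d_data_in, sizeof(h_data_in));
  cudaMalloc(&d_data_out, 14 * sizeof(int32_t));
  cudaMalloc(&d_offsets, sizeof(h_offsets));
  cudaMemcpy(d_data_in, h_data_in, sizeof(h_data_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_offsets, h_offsets, sizeof(h_offsets), cudaMemcpyHostToDevice);

  // Fancy iterators producing, for range i: its source iterator, its size, and
  // its destination pointer (exactly as in the doc snippet above)
  thrust::counting_iterator<uint32_t> iota(0);
  auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in});
  auto sizes        = thrust::make_transform_iterator(iota, GetRunLength{d_offsets});
  auto ptrs_out     = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets});

  // First call sizes the temporary storage; second call does the copies
  void *d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
                           num_ranges);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
                           num_ranges);

  int32_t h_data_out[14];
  cudaMemcpy(h_data_out, d_data_out, sizeof(h_data_out), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 14; ++i)
  {
    printf("%d ", h_data_out[i]); // prints: 4 4 2 2 2 7 3 3 3 1 1 1 1 1
  }
  printf("\n");
  return 0;
}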
