Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #511 from senior-zero/fix-main/github/scan_interme…
Browse files Browse the repository at this point in the history
…diate_type

P2322R6 accumulator types for scan and reduce by key
  • Loading branch information
gevtushenko authored Aug 3, 2022
2 parents ed0491a + 3dbde5b commit e882acc
Show file tree
Hide file tree
Showing 19 changed files with 3,772 additions and 2,595 deletions.
1,054 changes: 610 additions & 444 deletions cub/agent/agent_reduce_by_key.cuh

Large diffs are not rendered by default.

901 changes: 494 additions & 407 deletions cub/agent/agent_scan.cuh

Large diffs are not rendered by default.

799 changes: 414 additions & 385 deletions cub/agent/agent_scan_by_key.cuh

Large diffs are not rendered by default.

24 changes: 15 additions & 9 deletions cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,12 @@

#include <iterator>

#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../warp/warp_reduce.cuh"
#include "../config.cuh"
#include "../util_device.cuh"
#include <cub/config.cuh>
#include <cub/detail/uninitialized_copy.cuh>
#include <cub/thread/thread_load.cuh>
#include <cub/thread/thread_store.cuh>
#include <cub/util_device.cuh>
#include <cub/warp/warp_reduce.cuh>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -738,8 +739,10 @@ struct TilePrefixCallbackOp
// Update our status with our tile-aggregate
if (threadIdx.x == 0)
{
temp_storage.block_aggregate = block_aggregate;
tile_status.SetPartial(tile_idx, block_aggregate);
detail::uninitialized_copy(&temp_storage.block_aggregate,
block_aggregate);

tile_status.SetPartial(tile_idx, block_aggregate);
}

int predecessor_idx = tile_idx - threadIdx.x - 1;
Expand Down Expand Up @@ -768,8 +771,11 @@ struct TilePrefixCallbackOp
inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
tile_status.SetInclusive(tile_idx, inclusive_prefix);

temp_storage.exclusive_prefix = exclusive_prefix;
temp_storage.inclusive_prefix = inclusive_prefix;
detail::uninitialized_copy(&temp_storage.exclusive_prefix,
exclusive_prefix);

detail::uninitialized_copy(&temp_storage.inclusive_prefix,
inclusive_prefix);
}

// Return exclusive_prefix
Expand Down
52 changes: 34 additions & 18 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@

#pragma once

#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../warp/warp_exchange.cuh"
#include <cub/config.cuh>
#include <cub/detail/uninitialized_copy.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cub/warp/warp_exchange.cuh>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -209,7 +210,8 @@ private:
{
int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

CTA_SYNC();
Expand Down Expand Up @@ -250,7 +252,8 @@ private:
{
int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}
}

Expand Down Expand Up @@ -298,7 +301,8 @@ private:
{
int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

WARP_SYNC(0xffffffff);
Expand Down Expand Up @@ -328,7 +332,8 @@ private:
{
int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

WARP_SYNC(0xffffffff);
Expand All @@ -354,7 +359,8 @@ private:
{
int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

WARP_SYNC(0xffffffff);
Expand Down Expand Up @@ -385,7 +391,8 @@ private:
{
int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

CTA_SYNC();
Expand Down Expand Up @@ -434,7 +441,9 @@ private:
if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff +
item_offset,
input_items[ITEM]);
}
}
}
Expand Down Expand Up @@ -476,7 +485,8 @@ private:
{
int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]);
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

WARP_SYNC(0xffffffff);
Expand All @@ -486,7 +496,8 @@ private:
{
int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]);
detail::uninitialized_copy(output_items + ITEM,
temp_storage.buff[item_offset]);
}
}

Expand All @@ -512,7 +523,8 @@ private:
{
int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

WARP_SYNC(0xffffffff);
Expand Down Expand Up @@ -544,7 +556,8 @@ private:
{
int item_offset = ranks[ITEM];
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

CTA_SYNC();
Expand Down Expand Up @@ -584,7 +597,8 @@ private:
if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}
}

Expand Down Expand Up @@ -626,7 +640,8 @@ private:
{
int item_offset = ranks[ITEM];
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}

CTA_SYNC();
Expand Down Expand Up @@ -668,7 +683,8 @@ private:
if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage.buff[item_offset] = input_items[ITEM];
detail::uninitialized_copy(temp_storage.buff + item_offset,
input_items[ITEM]);
}
}

Expand Down
10 changes: 6 additions & 4 deletions cub/block/specializations/block_reduce_warp_reductions.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@

#pragma once

#include "../../warp/warp_reduce.cuh"
#include "../../config.cuh"
#include "../../util_ptx.cuh"
#include <cub/config.cuh>
#include <cub/detail/uninitialized_copy.cuh>
#include <cub/util_ptx.cuh>
#include <cub/warp/warp_reduce.cuh>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -143,7 +144,8 @@ struct BlockReduceWarpReductions
// Share lane aggregates
if (lane_id == 0)
{
new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate);
detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
warp_aggregate);
}

CTA_SYNC();
Expand Down
23 changes: 15 additions & 8 deletions cub/block/specializations/block_scan_warp_scans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@

#pragma once

#include "../../config.cuh"
#include "../../util_ptx.cuh"
#include "../../warp/warp_scan.cuh"
#include <cub/config.cuh>
#include <cub/detail/uninitialized_copy.cuh>
#include <cub/util_ptx.cuh>
#include <cub/warp/warp_scan.cuh>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -151,7 +152,10 @@ struct BlockScanWarpScans
{
// Last lane in each warp shares its warp-aggregate
if (lane_id == WARP_THREADS - 1)
temp_storage.warp_aggregates[warp_id] = warp_aggregate;
{
detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
warp_aggregate);
}

CTA_SYNC();

Expand Down Expand Up @@ -293,9 +297,11 @@ struct BlockScanWarpScans
T block_prefix = block_prefix_callback_op(block_aggregate);
if (lane_id == 0)
{
// Share the prefix with all threads
temp_storage.block_prefix = block_prefix;
exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0
// Share the prefix with all threads
detail::uninitialized_copy(&temp_storage.block_prefix,
block_prefix);

exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0
}
}

Expand Down Expand Up @@ -367,7 +373,8 @@ struct BlockScanWarpScans
if (lane_id == 0)
{
// Share the prefix with all threads
temp_storage.block_prefix = block_prefix;
detail::uninitialized_copy(&temp_storage.block_prefix,
block_prefix);
}
}

Expand Down
66 changes: 66 additions & 0 deletions cub/detail/uninitialized_copy.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include <cub/config.cuh>

#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN


namespace detail
{

template <typename T,
typename U,
typename cuda::std::enable_if<
cuda::std::is_trivially_copyable<T>::value,
int
>::type = 0>
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
*ptr = cuda::std::forward<U>(val);
}

template <typename T,
typename U,
typename cuda::std::enable_if<
!cuda::std::is_trivially_copyable<T>::value,
int
>::type = 0>
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
new (ptr) T(cuda::std::forward<U>(val));
}

} // namespace detail


CUB_NAMESPACE_END

Loading

0 comments on commit e882acc

Please sign in to comment.