From 7db7b728107b3ffe33822054d4d03a76eec06667 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 16 Dec 2025 16:24:24 -0600 Subject: [PATCH 1/5] Proclaim return types --- cpp/src/centrality/betweenness_centrality_impl.cuh | 10 +++++++--- cpp/src/sampling/negative_sampling_impl.cuh | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index d49da3e01b..eec62b50e3 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -736,8 +736,9 @@ void multisource_backward_pass( auto d_first = thrust::make_transform_iterator( distances_2d.begin(), - cuda::proclaim_return_type( - [invalid_distance] __device__(auto d) { return d == invalid_distance ? vertex_t{0} : d; })); + cuda::proclaim_return_type([invalid_distance] __host__ __device__(vertex_t d) { + return d == invalid_distance ? vertex_t{0} : d; + })); vertex_t global_max_distance = thrust::reduce(handle.get_thrust_policy(), d_first, d_first + distances_2d.size(), @@ -885,7 +886,10 @@ void multisource_backward_pass( if (num_segments_in_chunk > 0) { auto offset_first = thrust::make_transform_iterator( h_distance_offsets.data() + chunk_distance_start, - [chunk_vertex_start] __device__(size_t offset) { return offset - chunk_vertex_start; }); + cuda::proclaim_return_type( + [chunk_vertex_start] __host__ __device__(size_t offset) { + return offset - chunk_vertex_start; + })); // CUB segmented sort directly on consecutive arrays - no copy needed! size_t temp_storage_bytes = 0; diff --git a/cpp/src/sampling/negative_sampling_impl.cuh b/cpp/src/sampling/negative_sampling_impl.cuh index d9f5aa827d..861a8fff6a 100644 --- a/cpp/src/sampling/negative_sampling_impl.cuh +++ b/cpp/src/sampling/negative_sampling_impl.cuh @@ -92,7 +92,8 @@ normalize_biases(raft::handle_t const& handle, thrust::find_if(handle.get_thrust_policy(), thrust::make_reverse_iterator(gpu_biases->end()), thrust::make_reverse_iterator(gpu_biases->begin()), - [] __device__(weight_t bias) { return bias > weight_t{0}; })); + cuda::proclaim_return_type( + [] __device__(weight_t bias) { return bias > weight_t{0}; }))); thrust::transform(handle.get_thrust_policy(), gpu_biases->begin(), From 657504e83ea147ec9b9418577ce7b7e116a63046 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 17 Dec 2025 21:28:27 -0600 Subject: [PATCH 2/5] Try removing __host__ from device lambdas --- cpp/src/centrality/betweenness_centrality_impl.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index eec62b50e3..8dd6f5c027 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -736,7 +736,7 @@ void multisource_backward_pass( auto d_first = thrust::make_transform_iterator( distances_2d.begin(), - cuda::proclaim_return_type([invalid_distance] __host__ __device__(vertex_t d) { + cuda::proclaim_return_type([invalid_distance] __device__(vertex_t d) { return d == invalid_distance ? vertex_t{0} : d; })); vertex_t global_max_distance = thrust::reduce(handle.get_thrust_policy(), @@ -885,11 +885,10 @@ void multisource_backward_pass( if (num_segments_in_chunk > 0) { auto offset_first = thrust::make_transform_iterator( - h_distance_offsets.data() + chunk_distance_start, - cuda::proclaim_return_type( - [chunk_vertex_start] __host__ __device__(size_t offset) { - return offset - chunk_vertex_start; - })); + d_distance_offsets.data() + chunk_distance_start, + cuda::proclaim_return_type([chunk_vertex_start] __device__(size_t offset) { + return offset - chunk_vertex_start; + })); // CUB segmented sort directly on consecutive arrays - no copy needed! size_t temp_storage_bytes = 0; From 17dbd5a4a5d582a9f13fa5e52c728b0094d9af76 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 18 Dec 2025 00:28:21 -0600 Subject: [PATCH 3/5] Fix betweenness centrality --- .../betweenness_centrality_impl.cuh | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 8dd6f5c027..db22f794c8 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -796,18 +796,21 @@ void multisource_backward_pass( auto v_first = graph_view.local_vertex_partition_range_first(); // Calculate offsets for each distance level in the consecutive arrays - std::vector h_distance_offsets(global_max_distance + 1); + // Need global_max_distance + 2 elements: one for each distance level (0 to global_max_distance) + // plus a sentinel at the end for CUB segmented sort end offsets + std::vector h_distance_offsets(global_max_distance + 2); size_t offset = 0; for (vertex_t d = 0; d <= global_max_distance; ++d) { h_distance_offsets[d] = offset; offset += host_distance_counts[d]; } + h_distance_offsets[global_max_distance + 1] = offset; // sentinel = total_vertices // Copy offsets to device for kernel access - rmm::device_uvector d_distance_offsets(global_max_distance + 1, handle.get_stream()); + rmm::device_uvector d_distance_offsets(global_max_distance + 2, handle.get_stream()); raft::update_device(d_distance_offsets.data(), h_distance_offsets.data(), - global_max_distance + 1, + global_max_distance + 2, handle.get_stream()); // Populate consecutive arrays - single scan of distance array @@ -874,7 +877,18 @@ void multisource_backward_pass( // Allocate temporary storage for CUB segmented sort rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - // Process each chunk - sort consecutive arrays directly in-place + // Find max chunk size to allocate output buffers once + size_t max_chunk_size = 0; + for (size_t chunk_i = 0; chunk_i < num_chunks; ++chunk_i) { + size_t chunk_size = h_vertex_chunk_offsets[chunk_i + 1] - h_vertex_chunk_offsets[chunk_i]; + max_chunk_size = std::max(max_chunk_size, chunk_size); + } + + // Allocate output buffers for CUB sort (requires separate input/output buffers) + rmm::device_uvector d_sorted_vertices(max_chunk_size, handle.get_stream()); + rmm::device_uvector d_sorted_sources(max_chunk_size, handle.get_stream()); + + // Process each chunk for (size_t chunk_i = 0; chunk_i < num_chunks; ++chunk_i) { size_t chunk_vertex_start = h_vertex_chunk_offsets[chunk_i]; size_t chunk_vertex_end = h_vertex_chunk_offsets[chunk_i + 1]; @@ -890,14 +904,14 @@ void multisource_backward_pass( return offset - chunk_vertex_start; })); - // CUB segmented sort directly on consecutive arrays - no copy needed! + // CUB segmented sort requires separate input and output buffers size_t temp_storage_bytes = 0; cub::DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, all_vertices.data() + chunk_vertex_start, - all_vertices.data() + chunk_vertex_start, - all_sources.data() + chunk_vertex_start, + d_sorted_vertices.data(), all_sources.data() + chunk_vertex_start, + d_sorted_sources.data(), chunk_size, num_segments_in_chunk, offset_first, @@ -911,14 +925,24 @@ void multisource_backward_pass( cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), temp_storage_bytes, all_vertices.data() + chunk_vertex_start, - all_vertices.data() + chunk_vertex_start, - all_sources.data() + chunk_vertex_start, + d_sorted_vertices.data(), all_sources.data() + chunk_vertex_start, + d_sorted_sources.data(), chunk_size, num_segments_in_chunk, offset_first, offset_first + 1, handle.get_stream()); + + // Copy sorted results back to the original arrays + thrust::copy(handle.get_thrust_policy(), + d_sorted_vertices.begin(), + d_sorted_vertices.begin() + chunk_size, + all_vertices.data() + chunk_vertex_start); + thrust::copy(handle.get_thrust_policy(), + d_sorted_sources.begin(), + d_sorted_sources.begin() + chunk_size, + all_sources.data() + chunk_vertex_start); } } } From 946381563af534b0e59c46ab0fb1135468d1f296 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 18 Dec 2025 00:40:31 -0600 Subject: [PATCH 4/5] Simplify sort outputs --- .../betweenness_centrality_impl.cuh | 35 ++++++------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index db22f794c8..77e0d9c1f4 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -877,16 +877,9 @@ void multisource_backward_pass( // Allocate temporary storage for CUB segmented sort rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - // Find max chunk size to allocate output buffers once - size_t max_chunk_size = 0; - for (size_t chunk_i = 0; chunk_i < num_chunks; ++chunk_i) { - size_t chunk_size = h_vertex_chunk_offsets[chunk_i + 1] - h_vertex_chunk_offsets[chunk_i]; - max_chunk_size = std::max(max_chunk_size, chunk_size); - } - - // Allocate output buffers for CUB sort (requires separate input/output buffers) - rmm::device_uvector d_sorted_vertices(max_chunk_size, handle.get_stream()); - rmm::device_uvector d_sorted_sources(max_chunk_size, handle.get_stream()); + // Allocate output buffers for CUB sort (input/output cannot overlap) + rmm::device_uvector sorted_vertices(total_vertices, handle.get_stream()); + rmm::device_uvector sorted_sources(total_vertices, handle.get_stream()); // Process each chunk for (size_t chunk_i = 0; chunk_i < num_chunks; ++chunk_i) { @@ -909,9 +902,9 @@ void multisource_backward_pass( cub::DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, all_vertices.data() + chunk_vertex_start, - d_sorted_vertices.data(), + sorted_vertices.data() + chunk_vertex_start, all_sources.data() + chunk_vertex_start, - d_sorted_sources.data(), + sorted_sources.data() + chunk_vertex_start, chunk_size, num_segments_in_chunk, offset_first, @@ -925,26 +918,20 @@ void multisource_backward_pass( cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), temp_storage_bytes, all_vertices.data() + chunk_vertex_start, - d_sorted_vertices.data(), + sorted_vertices.data() + chunk_vertex_start, all_sources.data() + chunk_vertex_start, - d_sorted_sources.data(), + sorted_sources.data() + chunk_vertex_start, chunk_size, num_segments_in_chunk, offset_first, offset_first + 1, handle.get_stream()); - - // Copy sorted results back to the original arrays - thrust::copy(handle.get_thrust_policy(), - d_sorted_vertices.begin(), - d_sorted_vertices.begin() + chunk_size, - all_vertices.data() + chunk_vertex_start); - thrust::copy(handle.get_thrust_policy(), - d_sorted_sources.begin(), - d_sorted_sources.begin() + chunk_size, - all_sources.data() + chunk_vertex_start); } } + + // Use the sorted arrays for subsequent processing + all_vertices = std::move(sorted_vertices); + all_sources = std::move(sorted_sources); } // Process distance levels using pre-computed buckets (now with sorted vertices) From f3d7f0e6b06c71c71b225631c84529031f60c2fa Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 18 Dec 2025 01:22:35 -0600 Subject: [PATCH 5/5] Fix out-of-bounds access in od_shortest_distances_impl.cuh --- cpp/src/traversal/od_shortest_distances_impl.cuh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 68936974b9..549b6887bc 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -273,8 +273,12 @@ __global__ static void multi_partition_copy( if (tmp_idx < num_elems) { auto partition = partition_op(*(input_first + tmp_idx)); tmp_partitions[i] = partition; - tmp_offsets[i] = tmp_counts[partition]; - ++tmp_counts[partition]; + // Skip count update for discarded elements (partition == max_num_partitions) + // to avoid out-of-bounds access on tmp_counts array + if (partition != static_cast(max_num_partitions)) { + tmp_offsets[i] = tmp_counts[partition]; + ++tmp_counts[partition]; + } } tmp_idx += gridDim.x * blockDim.x; }