diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index d49da3e01b..77e0d9c1f4 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -736,8 +736,9 @@ void multisource_backward_pass( auto d_first = thrust::make_transform_iterator( distances_2d.begin(), - cuda::proclaim_return_type( - [invalid_distance] __device__(auto d) { return d == invalid_distance ? vertex_t{0} : d; })); + cuda::proclaim_return_type([invalid_distance] __device__(vertex_t d) { + return d == invalid_distance ? vertex_t{0} : d; + })); vertex_t global_max_distance = thrust::reduce(handle.get_thrust_policy(), d_first, d_first + distances_2d.size(), @@ -795,18 +796,21 @@ void multisource_backward_pass( auto v_first = graph_view.local_vertex_partition_range_first(); // Calculate offsets for each distance level in the consecutive arrays - std::vector h_distance_offsets(global_max_distance + 1); + // Need global_max_distance + 2 elements: one for each distance level (0 to global_max_distance) + // plus a sentinel at the end for CUB segmented sort end offsets + std::vector h_distance_offsets(global_max_distance + 2); size_t offset = 0; for (vertex_t d = 0; d <= global_max_distance; ++d) { h_distance_offsets[d] = offset; offset += host_distance_counts[d]; } + h_distance_offsets[global_max_distance + 1] = offset; // sentinel = total_vertices // Copy offsets to device for kernel access - rmm::device_uvector d_distance_offsets(global_max_distance + 1, handle.get_stream()); + rmm::device_uvector d_distance_offsets(global_max_distance + 2, handle.get_stream()); raft::update_device(d_distance_offsets.data(), h_distance_offsets.data(), - global_max_distance + 1, + global_max_distance + 2, handle.get_stream()); // Populate consecutive arrays - single scan of distance array @@ -873,7 +877,11 @@ void multisource_backward_pass( // Allocate temporary storage for CUB segmented sort rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - // Process each chunk - sort consecutive arrays directly in-place + // Allocate output buffers for CUB sort (input/output cannot overlap) + rmm::device_uvector sorted_vertices(total_vertices, handle.get_stream()); + rmm::device_uvector sorted_sources(total_vertices, handle.get_stream()); + + // Process each chunk for (size_t chunk_i = 0; chunk_i < num_chunks; ++chunk_i) { size_t chunk_vertex_start = h_vertex_chunk_offsets[chunk_i]; size_t chunk_vertex_end = h_vertex_chunk_offsets[chunk_i + 1]; @@ -884,17 +892,19 @@ void multisource_backward_pass( if (num_segments_in_chunk > 0) { auto offset_first = thrust::make_transform_iterator( - h_distance_offsets.data() + chunk_distance_start, - [chunk_vertex_start] __device__(size_t offset) { return offset - chunk_vertex_start; }); + d_distance_offsets.data() + chunk_distance_start, + cuda::proclaim_return_type([chunk_vertex_start] __device__(size_t offset) { + return offset - chunk_vertex_start; + })); - // CUB segmented sort directly on consecutive arrays - no copy needed! + // CUB segmented sort requires separate input and output buffers size_t temp_storage_bytes = 0; cub::DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, all_vertices.data() + chunk_vertex_start, - all_vertices.data() + chunk_vertex_start, - all_sources.data() + chunk_vertex_start, + sorted_vertices.data() + chunk_vertex_start, all_sources.data() + chunk_vertex_start, + sorted_sources.data() + chunk_vertex_start, chunk_size, num_segments_in_chunk, offset_first, @@ -908,9 +918,9 @@ void multisource_backward_pass( cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), temp_storage_bytes, all_vertices.data() + chunk_vertex_start, - all_vertices.data() + chunk_vertex_start, - all_sources.data() + chunk_vertex_start, + sorted_vertices.data() + chunk_vertex_start, all_sources.data() + chunk_vertex_start, + sorted_sources.data() + chunk_vertex_start, chunk_size, num_segments_in_chunk, offset_first, @@ -918,6 +928,10 @@ void multisource_backward_pass( handle.get_stream()); } } + + // Use the sorted arrays for subsequent processing + all_vertices = std::move(sorted_vertices); + all_sources = std::move(sorted_sources); } // Process distance levels using pre-computed buckets (now with sorted vertices) diff --git a/cpp/src/sampling/negative_sampling_impl.cuh b/cpp/src/sampling/negative_sampling_impl.cuh index d9f5aa827d..861a8fff6a 100644 --- a/cpp/src/sampling/negative_sampling_impl.cuh +++ b/cpp/src/sampling/negative_sampling_impl.cuh @@ -92,7 +92,8 @@ normalize_biases(raft::handle_t const& handle, thrust::find_if(handle.get_thrust_policy(), thrust::make_reverse_iterator(gpu_biases->end()), thrust::make_reverse_iterator(gpu_biases->begin()), - [] __device__(weight_t bias) { return bias > weight_t{0}; })); + cuda::proclaim_return_type( + [] __device__(weight_t bias) { return bias > weight_t{0}; }))); thrust::transform(handle.get_thrust_policy(), gpu_biases->begin(), diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 68936974b9..549b6887bc 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -273,8 +273,12 @@ __global__ static void multi_partition_copy( if (tmp_idx < num_elems) { auto partition = partition_op(*(input_first + tmp_idx)); tmp_partitions[i] = partition; - tmp_offsets[i] = tmp_counts[partition]; - ++tmp_counts[partition]; + // Skip count update for discarded elements (partition == max_num_partitions) + // to avoid out-of-bounds access on tmp_counts array + if (partition != static_cast(max_num_partitions)) { + tmp_offsets[i] = tmp_counts[partition]; + ++tmp_counts[partition]; + } } tmp_idx += gridDim.x * blockDim.x; }