diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 5885d154aa5..5d2c942cd0c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -71,7 +71,7 @@ dependencies: - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport -- thriftpy2<=0.5.0 +- thriftpy2!=0.5.0,!=0.5.1 - ucx-proc=*=gpu - ucx-py==0.39.*,>=0.0.0a0 - wget diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 8e7c2156b93..f8a95169ddd 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -76,7 +76,7 @@ dependencies: - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport -- thriftpy2<=0.5.0 +- thriftpy2!=0.5.0,!=0.5.1 - ucx-proc=*=gpu - ucx-py==0.39.*,>=0.0.0a0 - wget diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml index 79b2837eb53..225f40fe2ec 100644 --- a/conda/recipes/cugraph-service/meta.yaml +++ b/conda/recipes/cugraph-service/meta.yaml @@ -32,7 +32,7 @@ outputs: - rapids-build-backend>=0.3.1,<0.4.0.dev0 run: - python - - thriftpy2 >=0.4.15 + - thriftpy2 >=0.4.15,!=0.5.0,!=0.5.1 - name: cugraph-service-server version: {{ version }} @@ -65,7 +65,7 @@ outputs: - numpy >=1.23,<2.0a0 - python - rapids-dask-dependency ={{ minor_version }} - - thriftpy2 >=0.4.15 + - thriftpy2 >=0.4.15,!=0.5.0,!=0.5.1 - ucx-py {{ ucx_py_version }} tests: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 61ec34c3319..441627cabce 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -284,6 +284,9 @@ set(CUGRAPH_SOURCES src/community/k_truss_sg_v64_e64.cu src/community/k_truss_sg_v32_e32.cu src/community/k_truss_sg_v32_e64.cu + src/community/k_truss_mg_v64_e64.cu + src/community/k_truss_mg_v32_e32.cu + src/community/k_truss_mg_v32_e64.cu src/lookup/lookup_src_dst_mg_v32_e32.cu 
src/lookup/lookup_src_dst_mg_v32_e64.cu src/lookup/lookup_src_dst_mg_v64_e64.cu diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp index 69d48098a5d..7dffcce298a 100644 --- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp +++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp @@ -53,7 +53,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -86,14 +87,15 @@ shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( * (exclusive) vertex ID. * * @return Tuple of vectors storing shuffled major vertices, minor vertices and optional weights, - * edge ids and edge types + * edge ids and edge types and rx counts */ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp index 63d7fd9685e..61ad833a529 100644 --- a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp +++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp @@ -251,7 +251,8 @@ class per_device_edgelist_t { store_transposed ? src_[0] : dst_[0], tmp_wgt, tmp_edge_id, - tmp_edge_type) = + tmp_edge_type, + std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( handle.raft_handle(), store_transposed ? 
std::move(dst_[0]) : std::move(src_[0]), diff --git a/cpp/src/c_api/graph_functions.cpp b/cpp/src/c_api/graph_functions.cpp index 91371b988b3..df741a349d2 100644 --- a/cpp/src/c_api/graph_functions.cpp +++ b/cpp/src/c_api/graph_functions.cpp @@ -72,7 +72,7 @@ struct create_vertex_pairs_functor : public cugraph::c_api::abstract_functor { second_copy.data(), second_->as_type(), second_->size_, handle_.get_stream()); if constexpr (multi_gpu) { - std::tie(first_copy, second_copy, std::ignore, std::ignore, std::ignore) = + std::tie(first_copy, second_copy, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, diff --git a/cpp/src/c_api/graph_mg.cpp b/cpp/src/c_api/graph_mg.cpp index 22ceea3f629..cc4acd31743 100644 --- a/cpp/src/c_api/graph_mg.cpp +++ b/cpp/src/c_api/graph_mg.cpp @@ -167,7 +167,8 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor { store_transposed ? edgelist_srcs : edgelist_dsts, edgelist_weights, edgelist_edge_ids, - edgelist_edge_types) = + edgelist_edge_types, + std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( handle_, std::move(store_transposed ? 
edgelist_dsts : edgelist_srcs), diff --git a/cpp/src/c_api/k_truss.cpp b/cpp/src/c_api/k_truss.cpp index 18e256b022a..37a0672676e 100644 --- a/cpp/src/c_api/k_truss.cpp +++ b/cpp/src/c_api/k_truss.cpp @@ -60,10 +60,7 @@ struct k_truss_functor : public cugraph::c_api::abstract_functor { { if constexpr (!cugraph::is_candidate::value) { unsupported(); - } else if constexpr (multi_gpu) { - unsupported(); } else { - // k_truss expects store_transposed == false if constexpr (store_transposed) { error_code_ = cugraph::c_api:: transpose_storage( diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 99fc1cd6fae..272e3d71f83 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -627,6 +627,7 @@ refine_clustering( store_transposed ? d_srcs : d_dsts, d_weights, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/src/community/edge_triangle_count_impl.cuh b/cpp/src/community/edge_triangle_count_impl.cuh index c4277e240be..225687c4cf0 100644 --- a/cpp/src/community/edge_triangle_count_impl.cuh +++ b/cpp/src/community/edge_triangle_count_impl.cuh @@ -250,7 +250,7 @@ edge_property_t, edge_t> edge_t handle.get_stream()); // There are still multiple copies here but is it worth sorting and reducing again? 
- std::tie(pair_srcs, pair_dsts, std::ignore, pair_count, std::ignore) = + std::tie(pair_srcs, pair_dsts, std::ignore, pair_count, std::ignore, std::ignore) = shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning +#include #include #include #include @@ -41,351 +43,6 @@ namespace cugraph { -template -struct unroll_edge { - size_t num_valid_edges{}; - raft::device_span num_triangles{}; - EdgeIterator edge_to_unroll_first{}; - EdgeIterator transposed_valid_edge_first{}; - EdgeIterator transposed_valid_edge_last{}; - EdgeIterator transposed_invalid_edge_last{}; - - __device__ thrust::tuple operator()(edge_t i) const - { - // edges are sorted with destination as key so reverse the edge when looking it - auto pair = thrust::make_tuple(thrust::get<1>(*(edge_to_unroll_first + i)), - thrust::get<0>(*(edge_to_unroll_first + i))); - // Find its position in either partition of the transposed edgelist - // An edge can be in found in either of the two partitions (valid or invalid) - auto itr = thrust::lower_bound( - thrust::seq, transposed_valid_edge_last, transposed_invalid_edge_last, pair); - size_t idx{}; - if (itr != transposed_invalid_edge_last && *itr == pair) { - idx = - static_cast(thrust::distance(transposed_valid_edge_last, itr) + num_valid_edges); - } else { - // The edge must be in the first boundary - itr = thrust::lower_bound( - thrust::seq, transposed_valid_edge_first, transposed_valid_edge_last, pair); - assert(*itr == pair); - idx = thrust::distance(transposed_valid_edge_first, itr); - } - cuda::atomic_ref atomic_counter(num_triangles[idx]); - auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed); - } -}; - -// FIXME: May re-locate this function as a general utility function for graph algorithm -// implementations. 
-template -rmm::device_uvector compute_prefix_sum(raft::handle_t const& handle, - raft::device_span sorted_vertices, - raft::device_span query_vertices) -{ - rmm::device_uvector prefix_sum(query_vertices.size() + 1, handle.get_stream()); - - auto count_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [query_vertices, - num_edges = sorted_vertices.size(), - sorted_vertices = sorted_vertices.begin()] __device__(size_t idx) { - auto itr_lower = thrust::lower_bound( - thrust::seq, sorted_vertices, sorted_vertices + num_edges, query_vertices[idx]); - - auto itr_upper = thrust::upper_bound( - thrust::seq, itr_lower, sorted_vertices + num_edges, query_vertices[idx]); - vertex_t dist = thrust::distance(itr_lower, itr_upper); - - return dist; - })); - - thrust::exclusive_scan(handle.get_thrust_policy(), - count_first, - count_first + query_vertices.size() + 1, - prefix_sum.begin()); - - return prefix_sum; -} - -template -edge_t remove_overcompensating_edges(raft::handle_t const& handle, - size_t buffer_size, - EdgeIterator potential_closing_or_incoming_edges, - EdgeIterator incoming_or_potential_closing_edges, - raft::device_span invalid_edgelist_srcs, - raft::device_span invalid_edgelist_dsts) -{ - // To avoid over-compensating, check whether the 'potential_closing_edges' - // are within the invalid edges. 
If yes, the was already unrolled - auto edges_not_overcomp = thrust::remove_if( - handle.get_thrust_policy(), - thrust::make_zip_iterator(potential_closing_or_incoming_edges, - incoming_or_potential_closing_edges), - thrust::make_zip_iterator(potential_closing_or_incoming_edges + buffer_size, - incoming_or_potential_closing_edges + buffer_size), - [num_invalid_edges = invalid_edgelist_dsts.size(), - invalid_first = - thrust::make_zip_iterator(invalid_edgelist_dsts.begin(), invalid_edgelist_srcs.begin()), - invalid_last = thrust::make_zip_iterator(invalid_edgelist_dsts.end(), - invalid_edgelist_srcs.end())] __device__(auto e) { - auto potential_edge = thrust::get<0>(e); - auto transposed_potential_or_incoming_edge = - thrust::make_tuple(thrust::get<1>(potential_edge), thrust::get<0>(potential_edge)); - auto itr = thrust::lower_bound( - thrust::seq, invalid_first, invalid_last, transposed_potential_or_incoming_edge); - return (itr != invalid_last && *itr == transposed_potential_or_incoming_edge); - }); - - auto dist = thrust::distance(thrust::make_zip_iterator(potential_closing_or_incoming_edges, - incoming_or_potential_closing_edges), - edges_not_overcomp); - - return dist; -} - -template -void unroll_p_r_or_q_r_edges(raft::handle_t const& handle, - graph_view_t& graph_view, - size_t num_invalid_edges, - size_t num_valid_edges, - raft::device_span edgelist_srcs, - raft::device_span edgelist_dsts, - raft::device_span num_triangles) -{ - auto prefix_sum_valid = compute_prefix_sum( - handle, - raft::device_span(edgelist_dsts.data(), num_valid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)); - - auto prefix_sum_invalid = compute_prefix_sum( - handle, - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)); - - auto potential_closing_edges = allocate_dataframe_buffer>( - prefix_sum_valid.back_element(handle.get_stream()) + - 
prefix_sum_invalid.back_element(handle.get_stream()), - handle.get_stream()); - - auto incoming_edges_to_r = allocate_dataframe_buffer>( - prefix_sum_valid.back_element(handle.get_stream()) + - prefix_sum_invalid.back_element(handle.get_stream()), - handle.get_stream()); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_invalid_edges), - [num_valid_edges, - num_invalid_edges, - invalid_dst_first = edgelist_dsts.begin() + num_valid_edges, - invalid_src_first = edgelist_srcs.begin() + num_valid_edges, - valid_src_first = edgelist_srcs.begin(), - valid_dst_first = edgelist_dsts.begin(), - prefix_sum_valid = prefix_sum_valid.data(), - prefix_sum_invalid = prefix_sum_invalid.data(), - potential_closing_edges = get_dataframe_buffer_begin(potential_closing_edges), - incoming_edges_to_r = get_dataframe_buffer_begin(incoming_edges_to_r)] __device__(auto idx) { - auto src = invalid_src_first[idx]; - auto dst = invalid_dst_first[idx]; - auto dst_array_end_valid = valid_dst_first + num_valid_edges; - - auto itr_lower_valid = - thrust::lower_bound(thrust::seq, valid_dst_first, dst_array_end_valid, dst); - auto idx_lower_valid = thrust::distance( - valid_dst_first, - itr_lower_valid); // Need a binary search to find the begining of the range - - auto invalid_end_dst = invalid_dst_first + num_invalid_edges; - - auto itr_lower_invalid = - thrust::lower_bound(thrust::seq, invalid_dst_first, invalid_end_dst, dst); - auto idx_lower_invalid = thrust::distance( - invalid_dst_first, - itr_lower_invalid); // Need a binary search to find the begining of the range - - auto incoming_edges_to_r_first_valid = thrust::make_zip_iterator( - valid_src_first + idx_lower_valid, thrust::make_constant_iterator(dst)); - thrust::copy( - thrust::seq, - incoming_edges_to_r_first_valid, - incoming_edges_to_r_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]), - incoming_edges_to_r + prefix_sum_valid[idx] + 
prefix_sum_invalid[idx]); - - auto incoming_edges_to_r_first_invalid = thrust::make_zip_iterator( - invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(dst)); - thrust::copy( - thrust::seq, - incoming_edges_to_r_first_invalid, - incoming_edges_to_r_first_invalid + (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]), - - incoming_edges_to_r + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]); - - if constexpr (is_q_r_edge) { - auto potential_closing_edges_first_valid = thrust::make_zip_iterator( - valid_src_first + idx_lower_valid, thrust::make_constant_iterator(src)); - thrust::copy( - thrust::seq, - potential_closing_edges_first_valid, - potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]), - potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]); - - auto potential_closing_edges_first_invalid = thrust::make_zip_iterator( - invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(src)); - thrust::copy(thrust::seq, - potential_closing_edges_first_invalid, - potential_closing_edges_first_invalid + - (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]), - potential_closing_edges + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]); - - } else { - auto potential_closing_edges_first_valid = thrust::make_zip_iterator( - thrust::make_constant_iterator(src), valid_src_first + idx_lower_valid); - thrust::copy( - thrust::seq, - potential_closing_edges_first_valid, - potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]), - potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]); - - auto potential_closing_edges_first_invalid = thrust::make_zip_iterator( - thrust::make_constant_iterator(src), invalid_src_first + idx_lower_invalid); - thrust::copy( - thrust::seq, - potential_closing_edges_first_invalid, - potential_closing_edges_first_invalid + - (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]), - 
potential_closing_edges + prefix_sum_invalid[idx] + (prefix_sum_valid[idx + 1])); - } - }); - - auto edges_exist = graph_view.has_edge( - handle, - raft::device_span(std::get<0>(potential_closing_edges).data(), - std::get<0>(potential_closing_edges).size()), - raft::device_span(std::get<1>(potential_closing_edges).data(), - std::get<1>(potential_closing_edges).size())); - - auto edge_to_existance = thrust::make_zip_iterator( - thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges), - get_dataframe_buffer_begin(incoming_edges_to_r)), - edges_exist.begin()); - - auto has_edge_last = thrust::remove_if(handle.get_thrust_policy(), - edge_to_existance, - edge_to_existance + edges_exist.size(), - [] __device__(auto e) { - auto edge_exists = thrust::get<1>(e); - return edge_exists == 0; - }); - - auto num_edge_exists = thrust::distance(edge_to_existance, has_edge_last); - - // After pushing the non-existant edges to the second partition, - // remove them by resizing both vertex pair buffer - resize_dataframe_buffer(potential_closing_edges, num_edge_exists, handle.get_stream()); - resize_dataframe_buffer(incoming_edges_to_r, num_edge_exists, handle.get_stream()); - - auto num_edges_not_overcomp = - remove_overcompensating_edges( - handle, - num_edge_exists, - get_dataframe_buffer_begin(potential_closing_edges), - get_dataframe_buffer_begin(incoming_edges_to_r), - raft::device_span(edgelist_srcs.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)); - - // After pushing the non-existant edges to the second partition, - // remove them by resizing both vertex pair buffer - resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream()); - resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream()); - - // Extra check for 'incoming_edges_to_r' - if constexpr (!is_q_r_edge) { - // Exchange the arguments (incoming_edges_to_r, 
num_edges_not_overcomp) order - // To also check if the 'incoming_edges_to_r' belong the the invalid_edgelist - num_edges_not_overcomp = - remove_overcompensating_edges( - handle, - num_edges_not_overcomp, - get_dataframe_buffer_begin(incoming_edges_to_r), - get_dataframe_buffer_begin(potential_closing_edges), - raft::device_span(edgelist_srcs.data() + num_valid_edges, - num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, - num_invalid_edges)); - - resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream()); - resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream()); - } - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges), - get_dataframe_buffer_begin(incoming_edges_to_r)), - thrust::make_zip_iterator( - get_dataframe_buffer_begin(potential_closing_edges) + num_edges_not_overcomp, - get_dataframe_buffer_begin(incoming_edges_to_r) + num_edges_not_overcomp), - [num_triangles = num_triangles.begin(), - num_valid_edges, - invalid_first = thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges, - edgelist_srcs.begin() + num_valid_edges), - invalid_last = thrust::make_zip_iterator( - edgelist_dsts.end(), edgelist_srcs.end())] __device__(auto potential_or_incoming_e) { - auto potential_e = thrust::get<0>(potential_or_incoming_e); - auto incoming_e_to_r = thrust::get<1>(potential_or_incoming_e); - // thrust::tuple> transposed_invalid_edge_; - auto transposed_invalid_edge = - thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<1>(potential_e)); - - if constexpr (!is_q_r_edge) { - transposed_invalid_edge = - thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<0>(potential_e)); - } - auto itr = - thrust::lower_bound(thrust::seq, invalid_first, invalid_last, transposed_invalid_edge); - if (itr != invalid_last) { assert(*itr == transposed_invalid_edge); } - auto dist = 
thrust::distance(invalid_first, itr) + num_valid_edges; - - cuda::atomic_ref atomic_counter(num_triangles[dist]); - auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed); - }); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_edges_not_overcomp), - unroll_edge{ - num_valid_edges, - raft::device_span(num_triangles.data(), num_triangles.size()), - get_dataframe_buffer_begin(potential_closing_edges), - thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()), - thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges, - edgelist_srcs.begin() + num_valid_edges), - thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())}); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_edges_not_overcomp), - unroll_edge{ - num_valid_edges, - raft::device_span(num_triangles.data(), num_triangles.size()), - get_dataframe_buffer_begin(incoming_edges_to_r), - thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()), - thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges, - edgelist_srcs.begin() + num_valid_edges), - thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())}); -} - namespace { template @@ -434,28 +91,6 @@ struct extract_low_to_high_degree_edges_t { } }; -template -struct generate_p_r_or_q_r_from_p_q { - size_t chunk_start{}; - raft::device_span intersection_offsets{}; - raft::device_span intersection_indices{}; - raft::device_span invalid_srcs{}; - raft::device_span invalid_dsts{}; - - __device__ thrust::tuple operator()(edge_t i) const - { - auto itr = thrust::upper_bound( - thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i); - auto idx = thrust::distance(intersection_offsets.begin() + 1, itr); - - if constexpr (generate_p_r) { - return thrust::make_tuple(invalid_srcs[chunk_start + idx], 
intersection_indices[i]); - - } else { - return thrust::make_tuple(invalid_dsts[chunk_start + idx], intersection_indices[i]); - } - } -}; } // namespace template @@ -470,8 +105,6 @@ k_truss(raft::handle_t const& handle, { // 1. Check input arguments. - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - CUGRAPH_EXPECTS(graph_view.is_symmetric(), "Invalid input arguments: K-truss currently supports undirected graphs only."); CUGRAPH_EXPECTS(!graph_view.is_multigraph(), @@ -497,7 +130,7 @@ k_truss(raft::handle_t const& handle, exclude_self_loop_t{}); if constexpr (multi_gpu) { - std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) = + std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning core_number_span{core_numbers.data(), core_numbers.size()}; - rmm::device_uvector srcs{0, handle.get_stream()}; - rmm::device_uvector dsts{0, handle.get_stream()}; - std::tie(srcs, dsts, wgts) = k_core(handle, - cur_graph_view, - edge_weight_view, - k - 1, - std::make_optional(k_core_degree_type_t::OUT), - std::make_optional(core_number_span)); + auto [srcs, dsts, wgts] = k_core(handle, + cur_graph_view, + edge_weight_view, + k - 1, + std::make_optional(k_core_degree_type_t::OUT), + std::make_optional(core_number_span)); if constexpr (multi_gpu) { - std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) = + std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( - handle, std::move(srcs), std::move(dsts), std::nullopt, std::nullopt, std::nullopt); + handle, std::move(srcs), std::move(dsts), std::move(wgts), std::nullopt, std::nullopt); } std::optional> tmp_renumber_map{std::nullopt}; - std::tie(*modified_graph, edge_weight, std::ignore, std::ignore, tmp_renumber_map) = create_graph_from_edgelist( handle, @@ -566,7 +197,7 @@ k_truss(raft::handle_t const& handle, 
std::nullopt, std::nullopt, cugraph::graph_properties_t{true, graph_view.is_multigraph()}, - false); + true); modified_graph_view = (*modified_graph).view(); @@ -577,10 +208,11 @@ k_truss(raft::handle_t const& handle, (*renumber_map).data(), *vertex_partition_range_lasts); } + renumber_map = std::move(tmp_renumber_map); } - // 4. Keep only the edges from a low-degree vertex to a high-degree vertex. + // 3. Keep only the edges from a low-degree vertex to a high-degree vertex. { auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view; @@ -625,7 +257,7 @@ k_truss(raft::handle_t const& handle, } if constexpr (multi_gpu) { - std::tie(srcs, dsts, wgts, std::ignore, std::ignore) = + std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning edgelist_srcs(0, handle.get_stream()); - rmm::device_uvector edgelist_dsts(0, handle.get_stream()); - std::optional> num_triangles{std::nullopt}; - std::optional> edgelist_wgts{std::nullopt}; edge_weight_view = edge_weight ? 
std::make_optional((*edge_weight).view()) : std::optional>{std::nullopt}; - auto prop_num_triangles = edge_triangle_count(handle, cur_graph_view); - - std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, num_triangles, std::ignore) = - decompress_to_edgelist( - handle, - cur_graph_view, - edge_weight_view, - // FIXME: Update 'decompress_edgelist' to support int32_t and int64_t values - std::make_optional(prop_num_triangles.view()), - std::optional>{std::nullopt}, - std::optional>(std::nullopt)); - auto transposed_edge_first = - thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()); + cugraph::edge_property_t edge_mask(handle, cur_graph_view); + cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), bool{true}); - auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin()); + while (true) { + // FIXME: This approach is very expensive when invalidating only few edges per iteration + // and should be address. + auto edge_triangle_counts = + edge_triangle_count(handle, cur_graph_view); - auto transposed_edge_triangle_count_pair_first = - thrust::make_zip_iterator(transposed_edge_first, (*num_triangles).begin()); + // Mask all the edges that have k - 2 count - thrust::sort_by_key(handle.get_thrust_policy(), - transposed_edge_first, - transposed_edge_first + edgelist_srcs.size(), - (*num_triangles).begin()); + auto prev_number_of_edges = cur_graph_view.compute_number_of_edges(handle); - cugraph::edge_property_t edge_mask(handle, cur_graph_view); - cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), true); - cur_graph_view.attach_edge_mask(edge_mask.view()); - - while (true) { - // 'invalid_transposed_edge_triangle_count_first' marks the beginning of the edges to be - // removed 'invalid_transposed_edge_triangle_count_first' + edgelist_srcs.size() marks the end - // of the edges to be removed 'edge_triangle_count_pair_first' marks the begining of the valid - // edges. 
- auto invalid_transposed_edge_triangle_count_first = - thrust::stable_partition(handle.get_thrust_policy(), - transposed_edge_triangle_count_pair_first, - transposed_edge_triangle_count_pair_first + edgelist_srcs.size(), - [k] __device__(auto e) { - auto num_triangles = thrust::get<1>(e); - return num_triangles >= k - 2; - }); - auto num_invalid_edges = static_cast( - thrust::distance(invalid_transposed_edge_triangle_count_first, - transposed_edge_triangle_count_pair_first + edgelist_srcs.size())); - - if (num_invalid_edges == 0) { break; } - - auto num_valid_edges = edgelist_srcs.size() - num_invalid_edges; - - // case 1. For the (p, q), find intersection 'r'. - - // nbr_intersection requires the edges to be sort by 'src' - // sort the invalid edges by src for nbr intersection - size_t edges_to_intersect_per_iteration = - static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 17); - - size_t prev_chunk_size = 0; - size_t chunk_num_invalid_edges = num_invalid_edges; - - auto num_chunks = - raft::div_rounding_up_safe(edgelist_srcs.size(), edges_to_intersect_per_iteration); - - for (size_t i = 0; i < num_chunks; ++i) { - auto chunk_size = std::min(edges_to_intersect_per_iteration, chunk_num_invalid_edges); - thrust::sort_by_key(handle.get_thrust_policy(), - edge_first + num_valid_edges, - edge_first + edgelist_srcs.size(), - (*num_triangles).begin() + num_valid_edges); - - auto [intersection_offsets, intersection_indices] = - detail::nbr_intersection(handle, - cur_graph_view, - cugraph::edge_dummy_property_t{}.view(), - edge_first + num_valid_edges + prev_chunk_size, - edge_first + num_valid_edges + prev_chunk_size + chunk_size, - std::array{true, true}, - do_expensive_check); - - // Update the number of triangles of each (p, q) edges by looking at their intersection - // size. 
- thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chunk_size), - [chunk_start = prev_chunk_size, - num_triangles = raft::device_span((*num_triangles).data() + num_valid_edges, - num_invalid_edges), - intersection_offsets = raft::device_span( - intersection_offsets.data(), intersection_offsets.size())] __device__(auto i) { - num_triangles[chunk_start + i] -= - (intersection_offsets[i + 1] - intersection_offsets[i]); - }); - - // FIXME: Find a way to not have to maintain a dataframe_buffer - auto vertex_pair_buffer_p_r_edge_p_q = - allocate_dataframe_buffer>(intersection_indices.size(), - handle.get_stream()); - thrust::tabulate( - handle.get_thrust_policy(), - get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q), - get_dataframe_buffer_end(vertex_pair_buffer_p_r_edge_p_q), - generate_p_r_or_q_r_from_p_q{ - prev_chunk_size, - raft::device_span(intersection_offsets.data(), - intersection_offsets.size()), - raft::device_span(intersection_indices.data(), - intersection_indices.size()), - raft::device_span(edgelist_srcs.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, - num_invalid_edges)}); - - auto vertex_pair_buffer_q_r_edge_p_q = - allocate_dataframe_buffer>(intersection_indices.size(), - handle.get_stream()); - thrust::tabulate( - handle.get_thrust_policy(), - get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q), - get_dataframe_buffer_end(vertex_pair_buffer_q_r_edge_p_q), - generate_p_r_or_q_r_from_p_q{ - prev_chunk_size, - raft::device_span(intersection_offsets.data(), - intersection_offsets.size()), - raft::device_span(intersection_indices.data(), - intersection_indices.size()), - raft::device_span(edgelist_srcs.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, - num_invalid_edges)}); - - // Unrolling the edges require the edges to be sorted by destination - // re-sort 
the invalid edges by 'dst' - thrust::sort_by_key(handle.get_thrust_policy(), - transposed_edge_first + num_valid_edges, - transposed_edge_first + edgelist_srcs.size(), - (*num_triangles).begin() + num_valid_edges); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(intersection_indices.size()), - unroll_edge{ - num_valid_edges, - raft::device_span((*num_triangles).data(), (*num_triangles).size()), - get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q), - transposed_edge_first, - transposed_edge_first + num_valid_edges, - transposed_edge_first + edgelist_srcs.size()}); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(intersection_indices.size()), - unroll_edge{ - num_valid_edges, - raft::device_span((*num_triangles).data(), (*num_triangles).size()), - get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q), - transposed_edge_first, - transposed_edge_first + num_valid_edges, - transposed_edge_first + edgelist_srcs.size()}); - - prev_chunk_size += chunk_size; - chunk_num_invalid_edges -= chunk_size; - } - // case 2: unroll (q, r) - // For each (q, r) edges to unroll, find the incoming edges to 'r' let's say from 'p' and - // create the pair (p, q) - cugraph::unroll_p_r_or_q_r_edges( - handle, - cur_graph_view, - num_invalid_edges, - num_valid_edges, - raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), - raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()), - raft::device_span((*num_triangles).data(), (*num_triangles).size())); - - // case 3: unroll (p, r) - cugraph::unroll_p_r_or_q_r_edges( + cugraph::transform_e( handle, cur_graph_view, - num_invalid_edges, - num_valid_edges, - raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), - raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()), - raft::device_span((*num_triangles).data(), (*num_triangles).size())); - - // Remove edges 
that have a triangle count of zero. Those should not be accounted - // for during the unroling phase. - auto edges_with_triangle_last = thrust::stable_partition( - handle.get_thrust_policy(), - transposed_edge_triangle_count_pair_first, - transposed_edge_triangle_count_pair_first + (*num_triangles).size(), - [] __device__(auto e) { - auto num_triangles = thrust::get<1>(e); - return num_triangles > 0; - }); - - auto num_edges_with_triangles = static_cast( - thrust::distance(transposed_edge_triangle_count_pair_first, edges_with_triangle_last)); - - thrust::sort(handle.get_thrust_policy(), - thrust::make_zip_iterator(edgelist_srcs.begin() + num_edges_with_triangles, - edgelist_dsts.begin() + num_edges_with_triangles), - thrust::make_zip_iterator(edgelist_srcs.end(), edgelist_dsts.end())); - - cugraph::edge_bucket_t edges_with_no_triangle(handle); - edges_with_no_triangle.insert(edgelist_srcs.begin() + num_edges_with_triangles, - edgelist_srcs.end(), - edgelist_dsts.begin() + num_edges_with_triangles); - - cur_graph_view.clear_edge_mask(); - if (edge_weight_view) { - cugraph::transform_e( - handle, - cur_graph_view, - edges_with_no_triangle, - cugraph::edge_src_dummy_property_t{}.view(), - cugraph::edge_dst_dummy_property_t{}.view(), - *edge_weight_view, - [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto wgt) { - return false; - }, - edge_mask.mutable_view(), - false); - } else { - cugraph::transform_e( - handle, - cur_graph_view, - edges_with_no_triangle, - cugraph::edge_src_dummy_property_t{}.view(), - cugraph::edge_dst_dummy_property_t{}.view(), - cugraph::edge_dummy_property_t{}.view(), - [] __device__( - auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) { - return false; - }, - edge_mask.mutable_view(), - false); - } + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + edge_triangle_counts.view(), + [k] __device__(auto src, auto dst, thrust::nullopt_t, 
thrust::nullopt_t, auto count) { + return count >= k - 2; + }, + edge_mask.mutable_view(), + false); + cur_graph_view.attach_edge_mask(edge_mask.view()); - edgelist_srcs.resize(num_edges_with_triangles, handle.get_stream()); - edgelist_dsts.resize(num_edges_with_triangles, handle.get_stream()); - (*num_triangles).resize(num_edges_with_triangles, handle.get_stream()); + if (prev_number_of_edges == cur_graph_view.compute_number_of_edges(handle)) { break; } } + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_wgts{std::nullopt}; + std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, std::ignore, std::ignore) = decompress_to_edgelist( handle, @@ -923,7 +340,8 @@ k_truss(raft::handle_t const& handle, edge_weight_view ? std::make_optional(*edge_weight_view) : std::nullopt, std::optional>{std::nullopt}, std::optional>{std::nullopt}, - std::optional>(std::nullopt)); + std::make_optional( + raft::device_span((*renumber_map).data(), (*renumber_map).size()))); std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts) = symmetrize_edgelist(handle, diff --git a/cpp/src/community/k_truss_mg_v32_e32.cu b/cpp/src/community/k_truss_mg_v32_e32.cu new file mode 100644 index 00000000000..4feb69f6098 --- /dev/null +++ b/cpp/src/community/k_truss_mg_v32_e32.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "community/k_truss_impl.cuh" + +namespace cugraph { + +// MG instantiation + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int32_t k, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int32_t k, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/community/k_truss_mg_v32_e64.cu b/cpp/src/community/k_truss_mg_v32_e64.cu new file mode 100644 index 00000000000..b07f9382612 --- /dev/null +++ b/cpp/src/community/k_truss_mg_v32_e64.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "community/k_truss_impl.cuh" + +namespace cugraph { + +// MG instantiation + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/community/k_truss_mg_v64_e64.cu b/cpp/src/community/k_truss_mg_v64_e64.cu new file mode 100644 index 00000000000..1c730fe272d --- /dev/null +++ b/cpp/src/community/k_truss_mg_v64_e64.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "community/k_truss_impl.cuh" + +namespace cugraph { + +// MG instantiation + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/community/triangle_count_impl.cuh b/cpp/src/community/triangle_count_impl.cuh index 0b453cfe262..e902901cd36 100644 --- a/cpp/src/community/triangle_count_impl.cuh +++ b/cpp/src/community/triangle_count_impl.cuh @@ -439,7 +439,7 @@ void triangle_count(raft::handle_t const& handle, extract_low_to_high_degree_edges_t{}); if constexpr (multi_gpu) { - std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) = + std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(edge_buffer), std::ignore, std::ignore, + std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/src/cores/k_core_impl.cuh b/cpp/src/cores/k_core_impl.cuh index 06402cc3382..2c5bf987a47 100644 --- a/cpp/src/cores/k_core_impl.cuh +++ b/cpp/src/cores/k_core_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,8 +37,6 @@ k_core(raft::handle_t const& handle, std::optional> core_numbers, bool do_expensive_check) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - rmm::device_uvector computed_core_numbers(0, handle.get_stream()); if (!core_numbers) { diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index 487f31e5e03..b39895129dc 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -408,7 +408,7 @@ all_pairs_similarity(raft::handle_t const& handle, // shuffle vertex pairs auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); - std::tie(v1, v2, std::ignore, std::ignore, std::ignore) = + std::tie(v1, v2, std::ignore, std::ignore, std::ignore, std::ignore) = detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning == cugraph::ops::graph::INVALID_ID); using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; int minor_comm_size{1}; diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index fb1dee1a92f..ed0a70e570f 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -349,7 +349,8 @@ coarsen_graph(raft::handle_t const& handle, // 1-2. 
globally shuffle - std::tie(edgelist_majors, edgelist_minors, edgelist_weights, std::ignore, std::ignore) = + std::tie( + edgelist_majors, edgelist_minors, edgelist_weights, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, @@ -475,6 +476,7 @@ coarsen_graph(raft::handle_t const& handle, reversed_edgelist_minors, reversed_edgelist_weights, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/src/structure/symmetrize_edgelist_impl.cuh b/cpp/src/structure/symmetrize_edgelist_impl.cuh index a6a4c0947c7..1fd566938eb 100644 --- a/cpp/src/structure/symmetrize_edgelist_impl.cuh +++ b/cpp/src/structure/symmetrize_edgelist_impl.cuh @@ -295,6 +295,7 @@ symmetrize_edgelist(raft::handle_t const& handle, upper_triangular_majors, upper_triangular_weights, std::ignore, + std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_vertex_pairs_with_values_by_gpu_id_impl( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -178,25 +179,27 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( handle.get_stream()); handle.sync_stream(); + std::vector rx_counts{}; + if (mem_frugal_flag) { // trade-off potential parallelism to lower peak memory - std::tie(majors, std::ignore) = + std::tie(majors, rx_counts) = shuffle_values(comm, majors.begin(), h_tx_value_counts, handle.get_stream()); - std::tie(minors, std::ignore) = + std::tie(minors, rx_counts) = shuffle_values(comm, minors.begin(), h_tx_value_counts, handle.get_stream()); if (weights) { - std::tie(weights, std::ignore) = + std::tie(weights, rx_counts) = shuffle_values(comm, (*weights).begin(), h_tx_value_counts, handle.get_stream()); } if (edge_ids) { - 
std::tie(edge_ids, std::ignore) = + std::tie(edge_ids, rx_counts) = shuffle_values(comm, (*edge_ids).begin(), h_tx_value_counts, handle.get_stream()); } if (edge_types) { - std::tie(edge_types, std::ignore) = + std::tie(edge_types, rx_counts) = shuffle_values(comm, (*edge_types).begin(), h_tx_value_counts, handle.get_stream()); } } else { @@ -204,7 +207,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( if (edge_ids) { if (edge_types) { std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids, edge_types), - std::ignore) = + rx_counts) = shuffle_values(comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), @@ -214,7 +217,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids), std::ignore) = + std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids), rx_counts) = shuffle_values(comm, thrust::make_zip_iterator( majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()), @@ -223,14 +226,14 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( } } else { if (edge_types) { - std::forward_as_tuple(std::tie(majors, minors, weights, edge_types), std::ignore) = + std::forward_as_tuple(std::tie(majors, minors, weights, edge_types), rx_counts) = shuffle_values(comm, thrust::make_zip_iterator( majors.begin(), minors.begin(), weights->begin(), edge_types->begin()), h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors, weights), std::ignore) = shuffle_values( + std::forward_as_tuple(std::tie(majors, minors, weights), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()), h_tx_value_counts, @@ -240,7 +243,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( } else { if (edge_ids) { if (edge_types) { - std::forward_as_tuple(std::tie(majors, minors, edge_ids, edge_types), std::ignore) = + std::forward_as_tuple(std::tie(majors, 
minors, edge_ids, edge_types), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator( @@ -248,7 +251,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors, edge_ids), std::ignore) = shuffle_values( + std::forward_as_tuple(std::tie(majors, minors, edge_ids), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()), h_tx_value_counts, @@ -256,13 +259,13 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( } } else { if (edge_types) { - std::forward_as_tuple(std::tie(majors, minors, edge_types), std::ignore) = shuffle_values( + std::forward_as_tuple(std::tie(majors, minors, edge_types), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()), h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors), std::ignore) = + std::forward_as_tuple(std::tie(majors, minors), rx_counts) = shuffle_values(comm, thrust::make_zip_iterator(majors.begin(), minors.begin()), h_tx_value_counts, @@ -276,7 +279,8 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( std::move(minors), std::move(weights), std::move(edge_ids), - std::move(edge_types)); + std::move(edge_types), + std::move(rx_counts)); } } // namespace @@ -288,7 +292,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -320,7 +325,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -366,7 +372,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - 
std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& edge_srcs, rmm::device_uvector&& edge_dsts, diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu index 7943ebcd5f4..db5a7c0e9bd 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu +++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu @@ -36,7 +36,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -49,7 +50,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -62,7 +64,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -76,7 +79,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -92,7 +96,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, @@ -104,7 +109,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, 
rmm::device_uvector&& majors, rmm::device_uvector&& minors, diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu index 230fce435f9..d79b2379224 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu +++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu @@ -35,7 +35,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -48,7 +49,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -61,7 +63,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -75,7 +78,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -91,7 +95,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, @@ -103,7 +108,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, diff --git 
a/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu index 8134c41696b..605976f7076 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu +++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu @@ -36,7 +36,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -49,7 +50,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -62,7 +64,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -76,7 +79,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -92,7 +96,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, @@ -104,7 +109,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, diff --git a/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu 
b/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu index 0c91eb546d6..db7be5a3031 100644 --- a/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu +++ b/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu @@ -40,6 +40,13 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& d_values, std::vector const& vertex_partition_range_lasts); +template std::tuple, rmm::device_uvector> +shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& handle, + rmm::device_uvector&& d_vertices, + rmm::device_uvector&& d_values, + std::vector const& vertex_partition_range_lasts); + template rmm::device_uvector shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, rmm::device_uvector&& d_vertices); diff --git a/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu b/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu index 5abce7c0783..7d968006bc7 100644 --- a/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu +++ b/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu @@ -35,6 +35,13 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& d_values, std::vector const& vertex_partition_range_lasts); +template std::tuple, rmm::device_uvector> +shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& handle, + rmm::device_uvector&& d_vertices, + rmm::device_uvector&& d_values, + std::vector const& vertex_partition_range_lasts); + template std::tuple, rmm::device_uvector> shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 892ba91af86..52d257b9bea 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -500,7 +500,7 @@ ConfigureTest(CORE_NUMBER_TEST cores/core_number_test.cpp) ConfigureTest(K_CORE_TEST cores/k_core_test.cpp) 
################################################################################################### -# - K-truss tests --------------------------------------------------------------------------------- +# - K-truss tests -------------------------------------------------------------------------- ConfigureTest(K_TRUSS_TEST community/k_truss_test.cpp) ################################################################################################### @@ -619,6 +619,10 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG EDGE TRIANGLE COUNT tests -------------------------------------------------------------- ConfigureTestMG(MG_EDGE_TRIANGLE_COUNT_TEST community/mg_edge_triangle_count_test.cpp) + ############################################################################################### + # - MG K-TRUSS tests -------------------------------------------------------------------------- + ConfigureTestMG(MG_K_TRUSS_TEST community/mg_k_truss_test.cpp) + ############################################################################################### # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------ ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST @@ -783,6 +787,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_COUNT_MULTI_EDGES c_api/mg_count_multi_edges_test.c) ConfigureCTestMG(MG_CAPI_EGONET_TEST c_api/mg_egonet_test.c) ConfigureCTestMG(MG_CAPI_TWO_HOP_NEIGHBORS_TEST c_api/mg_two_hop_neighbors_test.c) + ConfigureCTestMG(MG_CAPI_K_TRUSS c_api/mg_k_truss_test.c) rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing_mg DESTINATION bin/gtests/libcugraph_mg) diff --git a/cpp/tests/c_api/mg_k_truss_test.c b/cpp/tests/c_api/mg_k_truss_test.c new file mode 100644 index 00000000000..e406eb330a7 --- /dev/null +++ b/cpp/tests/c_api/mg_k_truss_test.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +/* + * Simple check of creating a graph from a COO on device memory. + */ +int generic_k_truss_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + size_t num_edges, + size_t num_results, + size_t k, + bool_t store_transposed, + vertex_t* h_result_src, + vertex_t* h_result_dst, + weight_t* h_result_wgt) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + cugraph_graph_t* graph = NULL; + + cugraph_induced_subgraph_result_t* result = NULL; + + data_type_id_t vertex_tid = INT32; + data_type_id_t size_t_tid = SIZE_T; + + ret_code = create_mg_test_graph( + handle, h_src, h_dst, h_wgt, num_edges, store_transposed, TRUE, &graph, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = cugraph_k_truss_subgraph(handle, graph, k, FALSE, &result, &ret_error); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_k_truss failed."); + + cugraph_type_erased_device_array_view_t* k_truss_src; + 
cugraph_type_erased_device_array_view_t* k_truss_dst; + cugraph_type_erased_device_array_view_t* k_truss_wgt; + + k_truss_src = cugraph_induced_subgraph_get_sources(result); + k_truss_dst = cugraph_induced_subgraph_get_destinations(result); + k_truss_wgt = cugraph_induced_subgraph_get_edge_weights(result); + + size_t k_truss_size = cugraph_type_erased_device_array_view_size(k_truss_src); + + vertex_t h_k_truss_src[k_truss_size]; + vertex_t h_k_truss_dst[k_truss_size]; + weight_t h_k_truss_wgt[k_truss_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_k_truss_src, k_truss_src, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_k_truss_dst, k_truss_dst, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_k_truss_wgt, k_truss_wgt, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + for (size_t i = 0; (i < k_truss_size) && (test_ret_value == 0); ++i) { + bool_t found = FALSE; + for (size_t j = 0; (j < num_results) && !found; ++j) { + if ((h_k_truss_src[i] == h_result_src[j]) && (h_k_truss_dst[i] == h_result_dst[j]) && + (h_k_truss_wgt[i] == h_result_wgt[j])) + found = TRUE; + } + TEST_ASSERT(test_ret_value, found, "k_truss subgraph has an edge that doesn't match"); + } + + cugraph_induced_subgraph_result_free(result); + cugraph_mg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_k_truss_subgraph(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 14; + size_t num_vertices = 7; + size_t num_results = 6; + size_t k = 3; + + vertex_t h_src[] = {0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 5, 6}; + vertex_t h_dst[] = {1, 2, 5, 0, 2, 3, 4, 6, 0, 1, 1, 1, 0, 1}; + weight_t 
h_wgt[] = { + 1.2f, 1.3f, 1.6f, 1.2f, 2.3f, 2.4f, 2.5f, 2.7f, 1.3f, 2.3f, 2.4f, 2.5f, 1.6f, 2.7f}; + + vertex_t h_result_src[] = {0, 2, 2, 1, 1, 0}; + vertex_t h_result_dst[] = {1, 1, 0, 0, 2, 2}; + weight_t h_result_wgt[] = {1.2f, 2.3f, 1.3f, 1.2f, 2.3f, 1.3f}; + + return generic_k_truss_test(handle, + h_src, + h_dst, + h_wgt, + num_edges, + num_results, + k, + FALSE, + h_result_src, + h_result_dst, + h_result_wgt); +} + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + void* raft_handle = create_mg_raft_handle(argc, argv); + cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); + + int result = 0; + result |= RUN_MG_TEST(test_k_truss_subgraph, handle); + + cugraph_free_resource_handle(handle); + free_mg_raft_handle(raft_handle); + + return result; +} diff --git a/cpp/tests/community/mg_k_truss_test.cpp b/cpp/tests/community/mg_k_truss_test.cpp new file mode 100644 index 00000000000..a1624949007 --- /dev/null +++ b/cpp/tests/community/mg_k_truss_test.cpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/device_comm_wrapper.hpp" +#include "utilities/mg_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/test_graphs.hpp" +#include "utilities/thrust_wrapper.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include + +struct KTruss_Usecase { + int32_t k_{3}; + bool test_weighted_{false}; + bool edge_masking_{false}; + bool check_correctness_{true}; +}; + +template +class Tests_MGKTruss + : public ::testing::TestWithParam> { + public: + Tests_MGKTruss() {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running KTruss on multiple GPUs to that of a single-GPU run + template + void run_current_test(KTruss_Usecase const& k_truss_usecase, input_usecase_t const& input_usecase) + { + using weight_t = float; + + HighResTimer hr_timer{}; + + // 1. 
create MG graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Construct graph"); + } + + auto [mg_graph, edge_weight, mg_renumber_map] = + cugraph::test::construct_graph( + *handle_, input_usecase, k_truss_usecase.test_weighted_, true, false, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto mg_graph_view = mg_graph.view(); + + std::optional> edge_mask{std::nullopt}; + if (k_truss_usecase.edge_masking_) { + edge_mask = cugraph::test::generate::edge_property( + *handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + + // 2. run MG KTruss + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG KTruss"); + } + + auto mg_edge_weight_view = + edge_weight ? std::make_optional((*edge_weight).view()) : std::nullopt; + auto [d_cugraph_srcs, d_cugraph_dsts, d_cugraph_wgts] = + cugraph::k_truss( + *handle_, mg_graph_view, mg_edge_weight_view, k_truss_usecase.k_, false); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + // 3. 
Compare SG & MG results + + if (k_truss_usecase.check_correctness_) { + cugraph::unrenumber_int_vertices( + *handle_, + d_cugraph_srcs.data(), + d_cugraph_srcs.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + cugraph::unrenumber_int_vertices( + *handle_, + d_cugraph_dsts.data(), + d_cugraph_dsts.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + auto global_d_cugraph_srcs = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_cugraph_srcs.data(), d_cugraph_srcs.size())); + + auto global_d_cugraph_dsts = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_cugraph_dsts.data(), d_cugraph_srcs.size())); + + rmm::device_uvector d_sorted_cugraph_srcs{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_cugraph_dsts{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_cugraph_wgts{0, handle_->get_stream()}; + + if (edge_weight) { + auto global_d_cugraph_wgts = cugraph::test::device_gatherv( + *handle_, + raft::device_span((*d_cugraph_wgts).data(), (*d_cugraph_wgts).size())); + + std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts, d_sorted_cugraph_wgts) = + cugraph::test::sort_by_key( + *handle_, global_d_cugraph_srcs, global_d_cugraph_dsts, global_d_cugraph_wgts); + + } else { + std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts) = + cugraph::test::sort(*handle_, global_d_cugraph_srcs, global_d_cugraph_dsts); + } + + // 3-1. Convert to SG graph + auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] = + cugraph::test::mg_graph_to_sg_graph( + *handle_, + mg_graph_view, + mg_edge_weight_view, + std::optional>{std::nullopt}, + std::make_optional>((*mg_renumber_map).data(), + (*mg_renumber_map).size()), + false); + + auto sg_edge_weight_view = + sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt; + + if (handle_->get_comms().get_rank() == int{0}) { + auto sg_graph_view = sg_graph.view(); + + // 3-2. 
Run SG KTruss + auto [ref_d_cugraph_srcs, ref_d_cugraph_dsts, ref_d_cugraph_wgts] = + cugraph::k_truss( + *handle_, sg_graph_view, sg_edge_weight_view, k_truss_usecase.k_, false); + + rmm::device_uvector d_sorted_ref_cugraph_srcs{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_ref_cugraph_dsts{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_ref_cugraph_wgts{0, handle_->get_stream()}; + + if (edge_weight) { + std::tie( + d_sorted_ref_cugraph_srcs, d_sorted_ref_cugraph_dsts, d_sorted_ref_cugraph_wgts) = + cugraph::test::sort_by_key( + *handle_, ref_d_cugraph_srcs, ref_d_cugraph_dsts, *ref_d_cugraph_wgts); + + } else { + std::tie(d_sorted_ref_cugraph_srcs, d_sorted_ref_cugraph_dsts) = + cugraph::test::sort(*handle_, ref_d_cugraph_srcs, ref_d_cugraph_dsts); + } + + // 3-3. Compare + auto h_cugraph_srcs = cugraph::test::to_host(*handle_, d_sorted_cugraph_srcs); + auto h_cugraph_dsts = cugraph::test::to_host(*handle_, d_sorted_cugraph_dsts); + auto ref_h_cugraph_srcs = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_srcs); + auto ref_h_cugraph_dsts = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_dsts); + + ASSERT_TRUE( + std::equal(h_cugraph_srcs.begin(), h_cugraph_srcs.end(), ref_h_cugraph_srcs.begin())); + + ASSERT_TRUE( + std::equal(h_cugraph_dsts.begin(), h_cugraph_dsts.end(), ref_h_cugraph_dsts.begin())); + + if (edge_weight) { + auto ref_h_cugraph_wgts = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_wgts); + + auto h_cugraph_wgts = cugraph::test::to_host(*handle_, d_sorted_cugraph_wgts); + + ASSERT_TRUE( + std::equal(h_cugraph_wgts.begin(), h_cugraph_wgts.end(), ref_h_cugraph_wgts.begin())); + } + } + } + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_MGKTruss::handle_ = nullptr; + +using Tests_MGKTruss_File = Tests_MGKTruss; +using Tests_MGKTruss_Rmat = Tests_MGKTruss; + +TEST_P(Tests_MGKTruss_File, CheckInt32Int32) +{ + auto param = GetParam(); + 
run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGKTruss_Rmat, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGKTruss_Rmat, CheckInt32Int64) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGKTruss_Rmat, CheckInt64Int64) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_tests, + Tests_MGKTruss_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(KTruss_Usecase{4, false, true, true}, KTruss_Usecase{5, true, true, true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_tests, + Tests_MGKTruss_Rmat, + ::testing::Combine( + ::testing::Values(KTruss_Usecase{4, false, false, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 16, 0.57, 0.19, 0.19, 0, true, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGKTruss_Rmat, + ::testing::Combine( + ::testing::Values(KTruss_Usecase{4, false, false, false}, + KTruss_Usecase{5, false, false, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/link_prediction/mg_similarity_test.cpp 
b/cpp/tests/link_prediction/mg_similarity_test.cpp index 8f674e6a6de..3bcabb6b6df 100644 --- a/cpp/tests/link_prediction/mg_similarity_test.cpp +++ b/cpp/tests/link_prediction/mg_similarity_test.cpp @@ -106,7 +106,7 @@ class Tests_MGSimilarity auto d_v1 = cugraph::test::to_device(*handle_, h_v1); auto d_v2 = std::move(two_hop_nbrs); - std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore) = + std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, diff --git a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp index 192caa5227e..730a3ac8f08 100644 --- a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp +++ b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp @@ -108,7 +108,7 @@ class Tests_MGSimilarity auto d_v1 = cugraph::test::to_device(*handle_, h_v1); auto d_v2 = std::move(two_hop_nbrs); - std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore) = + std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu index 681a7d8e6ff..fc6369ec721 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu @@ -149,6 +149,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection std::get<1>(mg_vertex_pair_buffer), std::ignore, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu 
b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu index 48bbc6176d8..06a23880d81 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu @@ -175,6 +175,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection std::get<1>(mg_vertex_pair_buffer), std::ignore, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp index 3d3d881fb23..b8ad06dd18b 100644 --- a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp +++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp @@ -123,6 +123,7 @@ class Tests_MGHasEdgeAndComputeMultiplicity store_transposed ? d_mg_edge_srcs : d_mg_edge_dsts, std::ignore, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index b6898fbaf78..0a706d1cf80 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -335,6 +335,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { store_transposed ? 
tmp_src_v : tmp_dst_v, tmp_weights_v, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/dependencies.yaml b/dependencies.yaml index 1ef92274bde..82affb08dac 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -493,7 +493,7 @@ dependencies: - requests - nccl>=2.9.9 - ucx-proc=*=gpu - - &ucx_py ucx-py==0.39.*,>=0.0.0a0 + - &ucx_py_unsuffixed ucx-py==0.39.*,>=0.0.0a0 - output_types: pyproject packages: # cudf uses fsspec but is protocol independent. cugraph @@ -504,15 +504,17 @@ dependencies: matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: - &ucx_py_cu11 ucx-py-cu11==0.39.*,>=0.0.0a0 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: - &ucx_py_cu12 ucx-py-cu12==0.39.*,>=0.0.0a0 - matrix: packages: - - *ucx_py + - *ucx_py_unsuffixed python_run_nx_cugraph: common: - output_types: [conda, pyproject] @@ -530,15 +532,17 @@ dependencies: matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: - &cugraph_cu11 cugraph-cu11==24.8.*,>=0.0.0a0 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: - &cugraph_cu12 cugraph-cu12==24.8.*,>=0.0.0a0 - matrix: packages: - - &cugraph cugraph==24.8.*,>=0.0.0a0 + - &cugraph_unsuffixed cugraph==24.8.*,>=0.0.0a0 python_run_cugraph_pyg: common: - output_types: [conda, pyproject] @@ -550,22 +554,22 @@ dependencies: matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: - *cugraph_cu11 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: - *cugraph_cu12 - matrix: packages: - - *cugraph + - *cugraph_unsuffixed python_run_cugraph_service_client: common: - output_types: [conda, pyproject] packages: - # this thriftpy2 entry can be removed entirely (or switched to a '!=') - # once a new release of that project resolves https://github.com/Thriftpy/thriftpy2/issues/281 - - &thrift thriftpy2<=0.5.0 + - &thrift thriftpy2!=0.5.0,!=0.5.1 python_run_cugraph_service_server: common: - 
output_types: [conda, pyproject] @@ -575,27 +579,31 @@ dependencies: - *numba - *numpy - *thrift - - output_types: pyproject - packages: - - *cugraph - - cugraph-service-client==24.8.*,>=0.0.0a0 - output_types: conda packages: - - *ucx_py + - *ucx_py_unsuffixed specific: - output_types: pyproject matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: + - *cugraph_cu11 + - cugraph-service-client-cu11==24.8.*,>=0.0.0a0 - *ucx_py_cu11 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: + - *cugraph_cu12 + - cugraph-service-client-cu12==24.8.*,>=0.0.0a0 - *ucx_py_cu12 - matrix: packages: - - *ucx_py + - *cugraph_unsuffixed + - cugraph-service-client==24.8.*,>=0.0.0a0 + - *ucx_py_unsuffixed test_cpp: common: - output_types: conda @@ -630,7 +638,7 @@ dependencies: - scikit-learn>=0.23.1 - output_types: [conda] packages: - - &pylibwholegraph_conda pylibwholegraph==24.8.*,>=0.0.0a0 + - &pylibwholegraph_unsuffixed pylibwholegraph==24.8.*,>=0.0.0a0 - *thrift test_python_pylibcugraph: common: @@ -648,7 +656,7 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==24.8.*,>=0.0.0a0 + - *cugraph_unsuffixed - pytorch>=2.0 - pytorch-cuda==11.8 - dgl>=1.1.0.cu* @@ -656,7 +664,7 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==24.8.*,>=0.0.0a0 + - *cugraph_unsuffixed - pytorch>=2.0 - pytorch-cuda==11.8 - &tensordict tensordict>=0.1.2 @@ -666,7 +674,7 @@ dependencies: common: - output_types: [conda] packages: - - &pytorch_conda pytorch>=2.0,<2.2.0a0 + - &pytorch_unsuffixed pytorch>=2.0,<2.2.0a0 specific: - output_types: [requirements] @@ -694,7 +702,7 @@ dependencies: common: - output_types: conda packages: - - *pylibwholegraph_conda + - *pylibwholegraph_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -703,19 +711,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" 
+ cuda_suffixed: "true" packages: - pylibwholegraph-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - pylibwholegraph-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibwholegraph_conda]} + - {matrix: null, packages: [*pylibwholegraph_unsuffixed]} depends_on_rmm: common: - output_types: conda packages: - - &rmm_conda rmm==24.8.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -724,19 +736,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - rmm-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - rmm-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*rmm_conda]} + - {matrix: null, packages: [*rmm_unsuffixed]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.8.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -745,19 +761,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - cudf-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - cudf-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*cudf_conda]} + - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_dask_cudf: common: - output_types: conda packages: - - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0 + - &dask_cudf_unsuffixed dask-cudf==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -766,19 +786,23 @@ dependencies: specific: 
- output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - dask-cudf-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - dask-cudf-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*dask_cudf_conda]} + - {matrix: null, packages: [*dask_cudf_unsuffixed]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_conda pylibraft==24.8.*,>=0.0.0a0 + - &pylibraft_unsuffixed pylibraft==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -787,19 +811,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - pylibraft-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - pylibraft-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibraft_conda]} + - {matrix: null, packages: [*pylibraft_unsuffixed]} depends_on_raft_dask: common: - output_types: conda packages: - - &raft_dask_conda raft-dask==24.8.*,>=0.0.0a0 + - &raft_dask_unsuffixed raft-dask==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -808,19 +836,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - raft-dask-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - raft-dask-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*raft_dask_conda]} + - {matrix: null, packages: [*raft_dask_unsuffixed]} depends_on_pylibcugraph: common: - output_types: conda packages: - - &pylibcugraph_conda pylibcugraph==24.8.*,>=0.0.0a0 + - 
&pylibcugraph_unsuffixed pylibcugraph==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -829,19 +861,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - pylibcugraph-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - pylibcugraph-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibcugraph_conda]} + - {matrix: null, packages: [*pylibcugraph_unsuffixed]} depends_on_pylibcugraphops: common: - output_types: conda packages: - - &pylibcugraphops_conda pylibcugraphops==24.8.*,>=0.0.0a0 + - &pylibcugraphops_unsuffixed pylibcugraphops==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -850,19 +886,26 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - pylibcugraphops-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - pylibcugraphops-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibcugraphops_conda]} + - {matrix: null, packages: [*pylibcugraphops_unsuffixed]} depends_on_cupy: common: - output_types: conda packages: - cupy>=12.0.0 + # NOTE: This is intentionally not broken into groups by a 'cuda_suffixed' selector like + # other packages with -cu{nn}x suffixes in this file. + # All RAPIDS wheel builds (including in devcontainers) expect cupy to be suffixed. 
specific: - output_types: [requirements, pyproject] matrices: diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index 1762b1e6d8e..b5a17e425a9 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -61,3 +61,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml index f9c992e3fb9..151ad79ea6c 100644 --- a/python/cugraph-equivariant/pyproject.toml +++ b/python/cugraph-equivariant/pyproject.toml @@ -69,3 +69,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index 8ccd305a6bd..c929c9c49be 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -67,3 +67,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml index e805a2aac95..75deea88e2e 100644 --- a/python/cugraph-service/client/pyproject.toml +++ b/python/cugraph-service/client/pyproject.toml @@ -20,7 +20,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "thriftpy2<=0.5.0", + "thriftpy2!=0.5.0,!=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -49,3 +49,4 @@ include = [ build-backend = "setuptools.build_meta" dependencies-file = "../../../dependencies.yaml" disable-cuda = true +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index 2369c47b6d6..c16e4589f25 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "numpy>=1.23,<2.0a0", "rapids-dask-dependency==24.8.*,>=0.0.0a0", "rmm==24.8.*,>=0.0.0a0", - "thriftpy2<=0.5.0", + "thriftpy2!=0.5.0,!=0.5.1", "ucx-py==0.39.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -77,3 +77,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph/cugraph/dask/centrality/katz_centrality.py b/python/cugraph/cugraph/dask/centrality/katz_centrality.py index a11be3b6870..6616670c6b6 100644 --- a/python/cugraph/cugraph/dask/centrality/katz_centrality.py +++ b/python/cugraph/cugraph/dask/centrality/katz_centrality.py @@ -162,7 +162,7 @@ def katz_centrality( do_expensive_check = False initial_hubs_guess_values = None - if nstart: + if nstart is not None: if input_graph.renumbered: if len(input_graph.renumber_map.implementation.col_names) > 1: cols = nstart.columns[:-1].to_list() diff --git a/python/cugraph/cugraph/tests/community/test_triangle_count.py b/python/cugraph/cugraph/tests/community/test_triangle_count.py index 449df32b52a..69cd5fd72e4 100644 --- a/python/cugraph/cugraph/tests/community/test_triangle_count.py +++ b/python/cugraph/cugraph/tests/community/test_triangle_count.py @@ -105,48 +105,32 @@ def test_triangles(input_combo): @pytest.mark.sg def 
test_triangles_int64(input_combo): Gnx = input_combo["Gnx"] - count_legacy_32 = cugraph.triangle_count(Gnx) + count_int32 = cugraph.triangle_count(Gnx)["counts"].sum() graph_file = input_combo["graph_file"] G = graph_file.get_graph() G.edgelist.edgelist_df = G.edgelist.edgelist_df.astype( {"src": "int64", "dst": "int64"} ) + count_int64 = cugraph.triangle_count(G)["counts"].sum() - count_exp_64 = ( - cugraph.triangle_count(G) - .sort_values("vertex") - .reset_index(drop=True) - .rename(columns={"counts": "exp_cugraph_counts"}) - ) - cugraph_exp_triangle_results = count_exp_64["exp_cugraph_counts"].sum() assert G.edgelist.edgelist_df["src"].dtype == "int64" assert G.edgelist.edgelist_df["dst"].dtype == "int64" - assert cugraph_exp_triangle_results == count_legacy_32 + assert count_int32 == count_int64 @pytest.mark.sg def test_triangles_no_weights(input_combo): G_weighted = input_combo["Gnx"] - count_legacy = ( - cugraph.triangle_count(G_weighted) - .sort_values("vertex") - .reset_index(drop=True) - .rename(columns={"counts": "exp_cugraph_counts"}) - ) + count_triangles_nx_graph = cugraph.triangle_count(G_weighted)["counts"].sum() graph_file = input_combo["graph_file"] G = graph_file.get_graph(ignore_weights=True) assert G.is_weighted() is False - triangle_count = ( - cugraph.triangle_count(G) - .sort_values("vertex") - .reset_index(drop=True) - .rename(columns={"counts": "exp_cugraph_counts"}) - ) - cugraph_exp_triangle_results = triangle_count["exp_cugraph_counts"].sum() - assert cugraph_exp_triangle_results == count_legacy + count_triangles = cugraph.triangle_count(G)["counts"].sum() + + assert count_triangles_nx_graph == count_triangles @pytest.mark.sg diff --git a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py index 1892e8a85a6..0a272e445fa 100644 --- a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py +++ 
b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -60,6 +60,7 @@ def runtest(world_rank: int, world_size: int): @pytest.mark.skipif( isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" ) +@pytest.mark.skip(reason="broken") def test_feature_storage_wholegraph_backend(): from pylibwholegraph.utils.multiprocess import multiprocess_run @@ -75,6 +76,7 @@ def test_feature_storage_wholegraph_backend(): @pytest.mark.skipif( isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" ) +@pytest.mark.skip(reason="broken") def test_feature_storage_wholegraph_backend_mg(): from pylibwholegraph.utils.multiprocess import multiprocess_run diff --git a/python/cugraph/cugraph/tests/traversal/test_paths.py b/python/cugraph/cugraph/tests/traversal/test_paths.py index 5ee22874f4a..4ef10da593c 100644 --- a/python/cugraph/cugraph/tests/traversal/test_paths.py +++ b/python/cugraph/cugraph/tests/traversal/test_paths.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -22,6 +22,7 @@ import cupy import cugraph from cugraph.testing import get_resultset, load_resultset +from cudf.testing.testing import assert_series_equal from cupyx.scipy.sparse import coo_matrix as cupy_coo_matrix @@ -204,7 +205,11 @@ def test_shortest_path_length_no_path(graphs): def test_shortest_path_length_no_target(graphs, load_traversal_results): cugraph_G, cupy_df = graphs - cugraph_path_1_to_all = cugraph.shortest_path_length(cugraph_G, 1) + cugraph_path_1_to_all = ( + cugraph.shortest_path_length(cugraph_G, 1) + .sort_values("vertex") + .reset_index(drop=True) + ) golden_path_1_to_all = get_resultset( resultset_name="traversal", algo="shortest_path_length", @@ -217,7 +222,12 @@ def test_shortest_path_length_no_target(graphs, load_traversal_results): # Cast networkx graph on cugraph vertex column type from str to int. # SSSP preserves vertex type, convert for comparison - assert cugraph_path_1_to_all == cupy_path_1_to_all + assert_series_equal( + cugraph_path_1_to_all["distance"], + cupy_path_1_to_all["distance"], + check_names=False, + check_dtype=False, + ) # results for vertex 8 and 9 are not returned assert cugraph_path_1_to_all.shape[0] == len(golden_path_1_to_all) + 2 diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index 218868da000..c3685b8ba1a 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -86,3 +86,4 @@ requires = [ "rmm==24.8.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 27825585c28..8a1824a7a0e 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -173,11 +173,19 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ weighted ├─ all_pairs_bellman_ford_path ├─ all_pairs_bellman_ford_path_length + ├─ all_pairs_dijkstra + ├─ all_pairs_dijkstra_path + ├─ all_pairs_dijkstra_path_length ├─ bellman_ford_path ├─ bellman_ford_path_length + ├─ dijkstra_path + ├─ dijkstra_path_length ├─ single_source_bellman_ford ├─ single_source_bellman_ford_path - └─ single_source_bellman_ford_path_length + ├─ single_source_bellman_ford_path_length + ├─ single_source_dijkstra + ├─ single_source_dijkstra_path + └─ single_source_dijkstra_path_length traversal └─ breadth_first_search ├─ bfs_edges @@ -253,6 +261,9 @@ Below is the list of algorithms that are currently supported in nx-cugraph. 
classes └─ function └─ is_negatively_weighted +convert + ├─ from_dict_of_lists + └─ to_dict_of_lists convert_matrix ├─ from_pandas_edgelist └─ from_scipy_sparse_array diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index f57b90eb402..2d6017fa219 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -42,6 +42,9 @@ # BEGIN: functions "all_pairs_bellman_ford_path", "all_pairs_bellman_ford_path_length", + "all_pairs_dijkstra", + "all_pairs_dijkstra_path", + "all_pairs_dijkstra_path_length", "all_pairs_shortest_path", "all_pairs_shortest_path_length", "ancestors", @@ -75,12 +78,15 @@ "descendants", "descendants_at_distance", "diamond_graph", + "dijkstra_path", + "dijkstra_path_length", "dodecahedral_graph", "edge_betweenness_centrality", "ego_graph", "eigenvector_centrality", "empty_graph", "florentine_families_graph", + "from_dict_of_lists", "from_pandas_edgelist", "from_scipy_sparse_array", "frucht_graph", @@ -131,6 +137,9 @@ "single_source_bellman_ford", "single_source_bellman_ford_path", "single_source_bellman_ford_path_length", + "single_source_dijkstra", + "single_source_dijkstra_path", + "single_source_dijkstra_path_length", "single_source_shortest_path", "single_source_shortest_path_length", "single_target_shortest_path", @@ -138,6 +147,7 @@ "star_graph", "tadpole_graph", "tetrahedral_graph", + "to_dict_of_lists", "transitivity", "triangles", "trivial_graph", @@ -171,8 +181,8 @@ "katz_centrality": "`nstart` isn't used (but is checked), and `normalized=False` is not supported.", "louvain_communities": "`seed` parameter is currently ignored, and self-loops are not yet supported.", "pagerank": "`dangling` parameter is not supported, but it is checked for validity.", - "shortest_path": "Negative weights are not yet supported, and method is ununsed.", - "shortest_path_length": "Negative weights are not yet supported, and method is ununsed.", + "shortest_path": 
"Negative weights are not yet supported.", + "shortest_path_length": "Negative weights are not yet supported.", "single_source_bellman_ford": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.", "single_source_bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.", "single_source_bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.", @@ -187,12 +197,27 @@ "all_pairs_bellman_ford_path_length": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, + "all_pairs_dijkstra": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "all_pairs_dijkstra_path": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "all_pairs_dijkstra_path_length": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, "bellman_ford_path": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. 
If None, then dtype is determined by the edge values.", }, "bellman_ford_path_length": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, + "dijkstra_path": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "dijkstra_path_length": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, "ego_graph": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, @@ -227,6 +252,15 @@ "single_source_bellman_ford_path_length": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, + "single_source_dijkstra": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "single_source_dijkstra_path": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "single_source_dijkstra_path_length": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. 
If None, then dtype is determined by the edge values.", + }, # END: additional_parameters }, } diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py index 68dbbace93d..7d6d77f34a4 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py @@ -43,7 +43,7 @@ def has_path(G, source, target): def shortest_path( G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None ): - """Negative weights are not yet supported, and method is ununsed.""" + """Negative weights are not yet supported.""" if method not in {"dijkstra", "bellman-ford"}: raise ValueError(f"method not supported: {method}") if weight is None: @@ -53,9 +53,9 @@ def shortest_path( # All pairs if method == "unweighted": paths = nxcg.all_pairs_shortest_path(G) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + paths = nxcg.all_pairs_dijkstra_path(G, weight=weight, dtype=dtype) + else: # method == 'bellman-ford': paths = nxcg.all_pairs_bellman_ford_path(G, weight=weight, dtype=dtype) if nx.__version__[:3] <= "3.4": paths = dict(paths) @@ -75,9 +75,11 @@ def shortest_path( # From source if method == "unweighted": paths = nxcg.single_source_shortest_path(G, source) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + paths = nxcg.single_source_dijkstra_path( + G, source, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': paths = nxcg.single_source_bellman_ford_path( G, source, weight=weight, dtype=dtype ) @@ -106,7 +108,7 @@ def _(G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None def shortest_path_length( G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None ): - """Negative weights are not yet supported, and method is ununsed.""" + """Negative weights are not yet 
supported.""" if method not in {"dijkstra", "bellman-ford"}: raise ValueError(f"method not supported: {method}") if weight is None: @@ -116,9 +118,11 @@ def shortest_path_length( # All pairs if method == "unweighted": lengths = nxcg.all_pairs_shortest_path_length(G) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + lengths = nxcg.all_pairs_dijkstra_path_length( + G, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': lengths = nxcg.all_pairs_bellman_ford_path_length( G, weight=weight, dtype=dtype ) @@ -127,9 +131,11 @@ def shortest_path_length( lengths = nxcg.single_target_shortest_path_length(G, target) if nx.__version__[:3] <= "3.4": lengths = dict(lengths) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + lengths = nxcg.single_source_dijkstra_path_length( + G, target, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': lengths = nxcg.single_source_bellman_ford_path_length( G, target, weight=weight, dtype=dtype ) @@ -137,21 +143,21 @@ def shortest_path_length( # From source if method == "unweighted": lengths = nxcg.single_source_shortest_path_length(G, source) - else: - # method == "dijkstra": - # method == 'bellman-ford': - lengths = dict( - nxcg.single_source_bellman_ford_path_length( - G, source, weight=weight, dtype=dtype - ) + elif method == "dijkstra": + lengths = nxcg.single_source_dijkstra_path_length( + G, source, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': + lengths = nxcg.single_source_bellman_ford_path_length( + G, source, weight=weight, dtype=dtype ) # From source to target elif method == "unweighted": G = _to_graph(G) lengths = _bfs(G, source, None, "Source", return_type="length", target=target) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + lengths = nxcg.dijkstra_path_length(G, source, target, weight, dtype=dtype) + else: # method == 'bellman-ford': lengths = 
nxcg.bellman_ford_path_length(G, source, target, weight, dtype=dtype) return lengths diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py index 714289c5b4b..0e98c366e4a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py @@ -61,7 +61,12 @@ def bidirectional_shortest_path(G, source, target): # TODO PERF: do bidirectional traversal in core G = _to_graph(G) if source not in G or target not in G: - raise nx.NodeNotFound(f"Either source {source} or target {target} is not in G") + if nx.__version__[:3] <= "3.3": + raise nx.NodeNotFound( + f"Either source {source} or target {target} is not in G" + ) + missing = f"Source {source}" if source not in G else f"Target {target}" + raise nx.NodeNotFound(f"{missing} is not in G") return _bfs(G, source, None, "Source", return_type="path", target=target) @@ -131,7 +136,7 @@ def _bfs( # return_type == "length-path" return {source: 0}, {source: [source]} - if cutoff is None: + if cutoff is None or np.isinf(cutoff): cutoff = -1 src_index = source if G.key_to_id is None else G.key_to_id[source] distances, predecessors, node_ids = plc.bfs( diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py index 32323dd45f3..032ef2c7fdf 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py @@ -25,6 +25,14 @@ from .unweighted import _bfs __all__ = [ + "dijkstra_path", + "dijkstra_path_length", + "single_source_dijkstra", + "single_source_dijkstra_path", + "single_source_dijkstra_path_length", + "all_pairs_dijkstra", + "all_pairs_dijkstra_path", + "all_pairs_dijkstra_path_length", "bellman_ford_path", "bellman_ford_path_length", "single_source_bellman_ford", @@ 
-44,14 +52,24 @@ def _add_doc(func): return func -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def bellman_ford_path(G, source, target, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def dijkstra_path(G, source, target, weight="weight", *, dtype=None): G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) return _sssp(G, source, weight, target, return_type="path", dtype=dtype) +@dijkstra_path._can_run +def _(G, source, target, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def bellman_ford_path(G, source, target, weight="weight", *, dtype=None): + return dijkstra_path(G, source, target, weight=weight, dtype=dtype) + + @bellman_ford_path._can_run def _(G, source, target, weight="weight", *, dtype=None): return ( @@ -61,14 +79,24 @@ def _(G, source, target, weight="weight", *, dtype=None): ) -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def bellman_ford_path_length(G, source, target, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def dijkstra_path_length(G, source, target, weight="weight", *, dtype=None): G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) return _sssp(G, source, weight, target, return_type="length", dtype=dtype) +@dijkstra_path_length._can_run +def _(G, source, target, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def bellman_ford_path_length(G, source, target, weight="weight", *, dtype=None): + return dijkstra_path_length(G, source, target, weight=weight, dtype=dtype) + + 
@bellman_ford_path_length._can_run def _(G, source, target, weight="weight", *, dtype=None): return ( @@ -78,12 +106,22 @@ def _(G, source, target, weight="weight", *, dtype=None): ) +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def single_source_dijkstra_path(G, source, cutoff=None, weight="weight", *, dtype=None): + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + return _sssp(G, source, weight, return_type="path", dtype=dtype, cutoff=cutoff) + + +@single_source_dijkstra_path._can_run +def _(G, source, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + @networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") @_add_doc def single_source_bellman_ford_path(G, source, weight="weight", *, dtype=None): - G = _to_graph(G, weight, 1, np.float32) - dtype = _get_float_dtype(dtype, graph=G, weight=weight) - return _sssp(G, source, weight, return_type="path", dtype=dtype) + return single_source_dijkstra_path(G, source, weight=weight, dtype=dtype) @single_source_bellman_ford_path._can_run @@ -95,12 +133,24 @@ def _(G, source, weight="weight", *, dtype=None): ) +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def single_source_dijkstra_path_length( + G, source, cutoff=None, weight="weight", *, dtype=None +): + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + return _sssp(G, source, weight, return_type="length", dtype=dtype, cutoff=cutoff) + + +@single_source_dijkstra_path_length._can_run +def _(G, source, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + @networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") @_add_doc def single_source_bellman_ford_path_length(G, source, weight="weight", *, dtype=None): - G = _to_graph(G, weight, 1, np.float32) - dtype = _get_float_dtype(dtype, graph=G, 
weight=weight) - return _sssp(G, source, weight, return_type="length", dtype=dtype) + return single_source_dijkstra_path_length(G, source, weight=weight, dtype=dtype) @single_source_bellman_ford_path_length._can_run @@ -112,12 +162,26 @@ def _(G, source, weight="weight", *, dtype=None): ) +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def single_source_dijkstra( + G, source, target=None, cutoff=None, weight="weight", *, dtype=None +): + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + return _sssp( + G, source, weight, target, return_type="length-path", dtype=dtype, cutoff=cutoff + ) + + +@single_source_dijkstra._can_run +def _(G, source, target=None, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + @networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") @_add_doc def single_source_bellman_ford(G, source, target=None, weight="weight", *, dtype=None): - G = _to_graph(G, weight, 1, np.float32) - dtype = _get_float_dtype(dtype, graph=G, weight=weight) - return _sssp(G, source, weight, target, return_type="length-path", dtype=dtype) + return single_source_dijkstra(G, source, target=target, weight=weight, dtype=dtype) @single_source_bellman_ford._can_run @@ -129,14 +193,41 @@ def _(G, source, target=None, weight="weight", *, dtype=None): ) -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def all_pairs_bellman_ford_path_length(G, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def all_pairs_dijkstra(G, cutoff=None, weight="weight", *, dtype=None): + # TODO PERF: batched bfs to compute many at once + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + for n in G: + yield ( + n, + _sssp(G, n, weight, return_type="length-path", dtype=dtype, cutoff=cutoff), + ) + + 
+@all_pairs_dijkstra._can_run +def _(G, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def all_pairs_dijkstra_path_length(G, cutoff=None, weight="weight", *, dtype=None): # TODO PERF: batched bfs to compute many at once G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) for n in G: - yield (n, _sssp(G, n, weight, return_type="length", dtype=dtype)) + yield (n, _sssp(G, n, weight, return_type="length", dtype=dtype, cutoff=cutoff)) + + +@all_pairs_dijkstra_path_length._can_run +def _(G, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def all_pairs_bellman_ford_path_length(G, weight="weight", *, dtype=None): + return all_pairs_dijkstra_path_length(G, weight=weight, dtype=dtype) @all_pairs_bellman_ford_path_length._can_run @@ -148,14 +239,24 @@ def _(G, weight="weight", *, dtype=None): ) -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def all_pairs_bellman_ford_path(G, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def all_pairs_dijkstra_path(G, cutoff=None, weight="weight", *, dtype=None): # TODO PERF: batched bfs to compute many at once G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) for n in G: - yield (n, _sssp(G, n, weight, return_type="path", dtype=dtype)) + yield (n, _sssp(G, n, weight, return_type="path", dtype=dtype, cutoff=cutoff)) + + +@all_pairs_dijkstra_path._can_run +def _(G, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def all_pairs_bellman_ford_path(G, weight="weight", *, 
dtype=None): + return all_pairs_dijkstra_path(G, weight=weight, dtype=dtype) @all_pairs_bellman_ford_path._can_run @@ -167,7 +268,17 @@ def _(G, weight="weight", *, dtype=None): ) -def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=False): +def _sssp( + G, + source, + weight, + target=None, + *, + return_type, + dtype, + reverse_path=False, + cutoff=None, +): """SSSP for weighted shortest paths. Parameters @@ -201,7 +312,7 @@ def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=Fa if weight not in G.edge_values: # No edge values, so use BFS instead - return _bfs(G, source, None, "Source", return_type=return_type, target=target) + return _bfs(G, source, cutoff, "Source", return_type=return_type, target=target) # Check for negative values since we don't support negative cycles edge_vals = G.edge_values[weight] @@ -217,7 +328,7 @@ def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=Fa return _bfs( G, source, - None, + None if cutoff is None else cutoff / edge_val, "Source", return_type=return_type, target=target, @@ -226,11 +337,16 @@ def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=Fa ) src_index = source if G.key_to_id is None else G.key_to_id[source] + if cutoff is None: + cutoff = np.inf + else: + cutoff = np.nextafter(cutoff, np.inf, dtype=np.float64) + node_ids, distances, predecessors = plc.sssp( resource_handle=plc.ResourceHandle(), graph=G._get_plc_graph(weight, 1, dtype), source=src_index, - cutoff=np.inf, + cutoff=cutoff, compute_predecessors=True, # TODO: False is not yet supported # compute_predecessors=return_type != "length", do_expensive_check=False, diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py index b34245d5031..9e6c080d6ef 100644 --- a/python/nx-cugraph/nx_cugraph/convert.py +++ b/python/nx-cugraph/nx_cugraph/convert.py @@ -14,7 +14,7 @@ import itertools import operator as op -from collections 
import Counter +from collections import Counter, defaultdict from collections.abc import Mapping from typing import TYPE_CHECKING @@ -24,7 +24,8 @@ import nx_cugraph as nxcg -from .utils import index_dtype +from .utils import index_dtype, networkx_algorithm +from .utils.misc import pairwise if TYPE_CHECKING: # pragma: no cover from nx_cugraph.typing import AttrKey, Dtype, EdgeValue, NodeValue, any_ndarray @@ -32,6 +33,8 @@ __all__ = [ "from_networkx", "to_networkx", + "from_dict_of_lists", + "to_dict_of_lists", ] concat = itertools.chain.from_iterable @@ -653,3 +656,98 @@ def _to_undirected_graph( ) # TODO: handle cugraph.Graph raise TypeError + + +@networkx_algorithm(version_added="24.08") +def from_dict_of_lists(d, create_using=None): + from .generators._utils import _create_using_class + + graph_class, inplace = _create_using_class(create_using) + key_to_id = defaultdict(itertools.count().__next__) + src_indices = cp.array( + # cp.repeat is slow to use here, so use numpy instead + np.repeat( + np.fromiter(map(key_to_id.__getitem__, d), index_dtype), + np.fromiter(map(len, d.values()), index_dtype), + ) + ) + dst_indices = cp.fromiter( + map(key_to_id.__getitem__, concat(d.values())), index_dtype + ) + # Initialize as directed first then symmetrize if undirected. + G = graph_class.to_directed_class().from_coo( + len(key_to_id), + src_indices, + dst_indices, + key_to_id=key_to_id, + ) + if not graph_class.is_directed(): + G = G.to_undirected() + if inplace: + return create_using._become(G) + return G + + +@networkx_algorithm(version_added="24.08") +def to_dict_of_lists(G, nodelist=None): + G = _to_graph(G) + src_indices = G.src_indices + dst_indices = G.dst_indices + if nodelist is not None: + try: + node_ids = G._nodekeys_to_nodearray(nodelist) + except KeyError as exc: + gname = "digraph" if G.is_directed() else "graph" + raise nx.NetworkXError( + f"The node {exc.args[0]} is not in the {gname}." 
+ ) from exc + mask = cp.isin(src_indices, node_ids) & cp.isin(dst_indices, node_ids) + src_indices = src_indices[mask] + dst_indices = dst_indices[mask] + # Sort indices so we can use `cp.unique` to determine boundaries. + # This is like exporting to DCSR. + if G.is_multigraph(): + stacked = cp.unique(cp.vstack((src_indices, dst_indices)), axis=1) + src_indices = stacked[0] + dst_indices = stacked[1] + else: + stacked = cp.vstack((dst_indices, src_indices)) + indices = cp.lexsort(stacked) + src_indices = src_indices[indices] + dst_indices = dst_indices[indices] + compressed_srcs, left_bounds = cp.unique(src_indices, return_index=True) + # Ensure we include isolate nodes in the result (and in proper order) + rv = None + if nodelist is not None: + if compressed_srcs.size != len(nodelist): + if G.key_to_id is None: + # `G._nodekeys_to_nodearray` does not check for valid node keys. + container = range(G._N) + for key in nodelist: + if key not in container: + gname = "digraph" if G.is_directed() else "graph" + raise nx.NetworkXError(f"The node {key} is not in the {gname}.") + rv = {key: [] for key in nodelist} + elif compressed_srcs.size != G._N: + rv = {key: [] for key in G} + # We use `boundaries` like this in `_groupby` too + boundaries = pairwise(itertools.chain(left_bounds.tolist(), [src_indices.size])) + dst_indices = dst_indices.tolist() + if G.key_to_id is None: + it = zip(compressed_srcs.tolist(), boundaries) + if rv is None: + return {src: dst_indices[start:end] for src, (start, end) in it} + rv.update((src, dst_indices[start:end]) for src, (start, end) in it) + return rv + to_key = G.id_to_key.__getitem__ + it = zip(compressed_srcs.tolist(), boundaries) + if rv is None: + return { + to_key(src): list(map(to_key, dst_indices[start:end])) + for src, (start, end) in it + } + rv.update( + (to_key(src), list(map(to_key, dst_indices[start:end]))) + for src, (start, end) in it + ) + return rv diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py 
b/python/nx-cugraph/nx_cugraph/convert_matrix.py index 67f6386987b..38139b913cf 100644 --- a/python/nx-cugraph/nx_cugraph/convert_matrix.py +++ b/python/nx-cugraph/nx_cugraph/convert_matrix.py @@ -15,7 +15,7 @@ import numpy as np from .generators._utils import _create_using_class -from .utils import index_dtype, networkx_algorithm +from .utils import _cp_iscopied_asarray, index_dtype, networkx_algorithm __all__ = [ "from_pandas_edgelist", @@ -34,16 +34,30 @@ def from_pandas_edgelist( edge_key=None, ): """cudf.DataFrame inputs also supported; value columns with str is unsuppported.""" + # This function never shares ownership of the underlying arrays of the DataFrame + # columns. We will perform a copy if necessary even if given e.g. a cudf.DataFrame. graph_class, inplace = _create_using_class(create_using) # Try to be optimal whether using pandas, cudf, or cudf.pandas - src_array = df[source].to_numpy() - dst_array = df[target].to_numpy() + src_series = df[source] + dst_series = df[target] try: # Optimistically try to use cupy, but fall back to numpy if necessary - src_array = cp.asarray(src_array) - dst_array = cp.asarray(dst_array) + src_array = src_series.to_cupy() + dst_array = dst_series.to_cupy() + except (AttributeError, TypeError, ValueError, NotImplementedError): + src_array = src_series.to_numpy() + dst_array = dst_series.to_numpy() + try: + # Minimize unnecessary data copies by tracking whether we copy or not + is_src_copied, src_array = _cp_iscopied_asarray( + src_array, orig_object=src_series + ) + is_dst_copied, dst_array = _cp_iscopied_asarray( + dst_array, orig_object=dst_series + ) np_or_cp = cp except ValueError: + is_src_copied = is_dst_copied = False src_array = np.asarray(src_array) dst_array = np.asarray(dst_array) np_or_cp = np @@ -65,8 +79,15 @@ def from_pandas_edgelist( src_indices = cp.asarray(np_or_cp.searchsorted(nodes, src_array), index_dtype) dst_indices = cp.asarray(np_or_cp.searchsorted(nodes, dst_array), index_dtype) else: - 
src_indices = cp.array(src_array) - dst_indices = cp.array(dst_array) + # Copy if necessary so we don't share ownership of input arrays. + if is_src_copied: + src_indices = src_array + else: + src_indices = cp.array(src_array) + if is_dst_copied: + dst_indices = dst_array + else: + dst_indices = cp.array(dst_array) if not graph_class.is_directed(): # Symmetrize the edges @@ -111,19 +132,28 @@ def from_pandas_edgelist( } kwargs["edge_values"] = edge_values - if graph_class.is_multigraph() and edge_key is not None: - try: - edge_keys = df[edge_key].to_list() - except (KeyError, TypeError) as exc: - raise nx.NetworkXError( - f"Invalid edge_key argument: {edge_key}" - ) from exc - if not graph_class.is_directed(): - # Symmetrize the edges - edge_keys = cp.hstack( - (edge_keys, edge_keys[mask] if mask is not None else edge_keys) - ) - kwargs["edge_keys"] = edge_keys + if ( + graph_class.is_multigraph() + and edge_key is not None + and ( + # In nx <= 3.3, `edge_key` was ignored if `edge_attr` is None + edge_attr is not None + or nx.__version__[:3] > "3.3" + ) + ): + try: + edge_keys = df[edge_key].to_list() + except (KeyError, TypeError) as exc: + raise nx.NetworkXError(f"Invalid edge_key argument: {edge_key}") from exc + if not graph_class.is_directed(): + # Symmetrize the edges; remember, `edge_keys` is a list! + if mask is None: + edge_keys *= 2 + else: + edge_keys += [ + key for keep, key in zip(mask.tolist(), edge_keys) if keep + ] + kwargs["edge_keys"] = edge_keys G = graph_class.from_coo(N, src_indices, dst_indices, **kwargs) if inplace: diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert.py b/python/nx-cugraph/nx_cugraph/tests/test_convert.py index 1a71b796861..634b28e961c 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_convert.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,10 +13,13 @@ import cupy as cp import networkx as nx import pytest +from packaging.version import parse import nx_cugraph as nxcg from nx_cugraph import interface +nxver = parse(nx.__version__) + @pytest.mark.parametrize( "graph_class", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] @@ -224,3 +227,48 @@ def test_multigraph(graph_class): H = nxcg.to_networkx(Gcg) assert type(G) is type(H) assert nx.utils.graphs_equal(G, H) + + +def test_to_dict_of_lists(): + G = nx.MultiGraph() + G.add_edge("a", "b") + G.add_edge("a", "c") + G.add_edge("a", "b") + expected = nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=["a", "b"]) + result = nxcg.to_dict_of_lists(G, nodelist=["a", "b"]) + assert expected == result + with pytest.raises(nx.NetworkXError, match="The node d is not in the graph"): + nx.to_dict_of_lists(G, nodelist=["a", "d"]) + with pytest.raises(nx.NetworkXError, match="The node d is not in the graph"): + nxcg.to_dict_of_lists(G, nodelist=["a", "d"]) + G.add_node("d") # No edges + expected = nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=["a", "d"]) + result = nxcg.to_dict_of_lists(G, nodelist=["a", "d"]) + assert expected == result + # Now try with default node ids + G = nx.DiGraph() + G.add_edge(0, 1) + G.add_edge(0, 2) + expected = nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=[0, 1]) + result = nxcg.to_dict_of_lists(G, nodelist=[0, 1]) + assert expected == result + with pytest.raises(nx.NetworkXError, match="The node 3 is not in the digraph"): + nx.to_dict_of_lists(G, nodelist=[0, 3]) + with pytest.raises(nx.NetworkXError, 
match="The node 3 is not in the digraph"): + nxcg.to_dict_of_lists(G, nodelist=[0, 3]) + G.add_node(3) # No edges + expected = nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=[0, 3]) + result = nxcg.to_dict_of_lists(G, nodelist=[0, 3]) + assert expected == result diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py new file mode 100644 index 00000000000..0a9cc087ce0 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py @@ -0,0 +1,86 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import networkx as nx +import pandas as pd +import pytest + +import nx_cugraph as nxcg +from nx_cugraph.utils import _cp_iscopied_asarray + +try: + import cudf +except ModuleNotFoundError: + cudf = None + + +DATA = [ + {"source": [0, 1], "target": [1, 2]}, # nodes are 0, 1, 2 + {"source": [0, 1], "target": [1, 3]}, # nodes are 0, 1, 3 (need renumbered!) 
+ {"source": ["a", "b"], "target": ["b", "c"]}, # nodes are 'a', 'b', 'c' +] +CREATE_USING = [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] + + +@pytest.mark.skipif("not cudf") +@pytest.mark.parametrize("data", DATA) +@pytest.mark.parametrize("create_using", CREATE_USING) +def test_from_cudf_edgelist(data, create_using): + df = cudf.DataFrame(data) + nxcg.from_pandas_edgelist(df, create_using=create_using) # Basic smoke test + source = df["source"] + if source.dtype == int: + is_copied, src_array = _cp_iscopied_asarray(source) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray(source.to_cupy()) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray( + source.to_cupy(), orig_object=source + ) + assert is_copied is False + # to numpy + is_copied, src_array = _cp_iscopied_asarray(source.to_numpy()) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray( + source.to_numpy(), orig_object=source + ) + assert is_copied is True + else: + with pytest.raises(TypeError): + _cp_iscopied_asarray(source) + with pytest.raises(TypeError): + _cp_iscopied_asarray(source.to_cupy()) + with pytest.raises(ValueError, match="Unsupported dtype"): + _cp_iscopied_asarray(source.to_numpy()) + with pytest.raises(ValueError, match="Unsupported dtype"): + _cp_iscopied_asarray(source.to_numpy(), orig_object=source) + + +@pytest.mark.parametrize("data", DATA) +@pytest.mark.parametrize("create_using", CREATE_USING) +def test_from_pandas_edgelist(data, create_using): + df = pd.DataFrame(data) + nxcg.from_pandas_edgelist(df, create_using=create_using) # Basic smoke test + source = df["source"] + if source.dtype == int: + is_copied, src_array = _cp_iscopied_asarray(source) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source) + assert is_copied is True + is_copied, src_array = 
_cp_iscopied_asarray(source.to_numpy()) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray( + source.to_numpy(), orig_object=source + ) + assert is_copied is True diff --git a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py index d784d8c13cb..176b531a6e7 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py @@ -48,7 +48,7 @@ def test_match_signature_and_names(): orig_sig = inspect.signature(orig_func) func_sig = inspect.signature(func) if not func.extra_params: - assert orig_sig == func_sig + assert orig_sig == func_sig, name else: # Ignore extra parameters added to nx-cugraph algorithm # The key of func.extra_params may be like "max_level : int, optional", @@ -60,14 +60,14 @@ def test_match_signature_and_names(): for name, p in func_sig.parameters.items() if name not in extra_params ] - ) + ), name if func.can_run is not nxcg.utils.decorators._default_can_run: - assert func_sig == inspect.signature(func.can_run) + assert func_sig == inspect.signature(func.can_run), name if func.should_run is not nxcg.utils.decorators._default_should_run: - assert func_sig == inspect.signature(func.should_run) + assert func_sig == inspect.signature(func.should_run), name # Matching function names? - assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__ + assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__, name # Matching dispatch names? # nx version >=3.2 uses name, version >=3.0,<3.2 uses dispatchname @@ -75,14 +75,14 @@ def test_match_signature_and_names(): dispatchname = dispatchable_func.dispatchname else: dispatchname = dispatchable_func.name - assert func.name == dispatchname + assert func.name == dispatchname, name # Matching modules (i.e., where function defined)? assert ( "networkx." 
+ func.__module__.split(".", 1)[1] == dispatchable_func.__module__ == orig_func.__module__ - ) + ), name # Matching package layout (i.e., which modules have the function)? nxcg_path = func.__module__ diff --git a/python/nx-cugraph/nx_cugraph/tests/test_utils.py b/python/nx-cugraph/nx_cugraph/tests/test_utils.py index fdd0c91995c..d38a286fa5d 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_utils.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,10 +10,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import cupy as cp import numpy as np import pytest -from nx_cugraph.utils import _get_int_dtype +from nx_cugraph.utils import _cp_iscopied_asarray, _get_int_dtype def test_get_int_dtype(): @@ -85,3 +86,20 @@ def test_get_int_dtype(): _get_int_dtype(7, signed=True, unsigned=True) assert _get_int_dtype(7, signed=True, unsigned=False) == np.int8 assert _get_int_dtype(7, signed=False, unsigned=True) == np.uint8 + + +def test_cp_iscopied_asarray(): + # We don't yet run doctest, so do simple copy/paste test here. 
+ # + # >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + # >>> is_copied + # True + # >>> a + # array([1, 2, 3]) + # >>> _cp_iscopied_asarray(a) + # (False, array([1, 2, 3])) + is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + assert is_copied is True + assert isinstance(a, cp.ndarray) + assert repr(a) == "array([1, 2, 3])" + assert _cp_iscopied_asarray(a)[0] is False diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py index eab4b42c2cc..8526524f1de 100644 --- a/python/nx-cugraph/nx_cugraph/utils/misc.py +++ b/python/nx-cugraph/nx_cugraph/utils/misc.py @@ -45,6 +45,7 @@ def pairwise(it): "_get_int_dtype", "_get_float_dtype", "_dtype_param", + "_cp_iscopied_asarray", ] # This may switch to np.uint32 at some point @@ -206,3 +207,34 @@ def _get_float_dtype( f"Dtype {dtype} cannot be safely promoted to float32 or float64" ) return rv + + +def _cp_iscopied_asarray(a, *args, orig_object=None, **kwargs): + """Like ``cp.asarray``, but also returns whether the input was copied. + + Use this to avoid unnecessary copies. If given, ``orig_object`` will + also be inspected to determine if it was copied. + + >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + >>> is_copied + True + >>> a + array([1, 2, 3]) + >>> _cp_iscopied_asarray(a) + (False, array([1, 2, 3])) + """ + arr = cp.asarray(a, *args, **kwargs) + ptr = arr.__cuda_array_interface__["data"][0] + if ( + hasattr(a, "__cuda_array_interface__") + and a.__cuda_array_interface__["data"][0] == ptr + and ( + orig_object is None + or hasattr(orig_object, "__cuda_array_interface__") + and orig_object.__cuda_array_interface__["data"][0] == ptr + ) + # Should we also check device_id? 
+ # and getattr(getattr(a, "data", None), "device_id", None) == arr.data.device_id + ): + return False, arr + return True, arr diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index 50881d5db90..07e09201c92 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -86,6 +86,7 @@ include = [ build-backend = "setuptools.build_meta" commit-files = ["_nx_cugraph/GIT_COMMIT"] dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" [tool.black] line-length = 88 diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index 984e1d140f2..5f13f45f99a 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -72,3 +72,4 @@ requires = [ "pylibraft==24.8.*,>=0.0.0a0", "rmm==24.8.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +matrix-entry = "cuda_suffixed=true"