NVIDIA · PointKernel · Jun 13, 2022 · Jun 4, 2022 · Jun 4, 2022 · Jun 7, 2022
@@ -37,8 +37,6 @@ conda activate cuda
 
 gpuci_logger "Check versions"
 python --version
-$CC --version
-$CXX --version
 
 gpuci_logger "Check conda environment"
 conda info

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 namespace cuco {
 
+using hash_value_type = uint32_t;
+
 namespace detail {
 
 // MurmurHash3_32 implementation from

@@ -21,6 +21,8 @@
 
 #include <cuda/std/atomic>
 
+#include <cooperative_groups.h>
+
 namespace cuco {
 namespace detail {
 
@@ -186,13 +188,15 @@ class linear_probing_impl
    *
    * If vector-load is enabled, the return slot is always even to avoid illegal memory access.
    *
-   * @tparam CG CUDA Cooperative Groups type
+   * @tparam ProbeKey Probe key type
+   *
    * @param g the Cooperative Group for which the initial slot is needed
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename CG>
-  __device__ __forceinline__ iterator initial_slot(CG const& g, Key const k) noexcept
+  template <typename ProbeKey>
+  __device__ __forceinline__ iterator
+  initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
   {
     auto const hash_value = [&]() {
       auto const tmp = hash_(k);
@@ -307,13 +311,15 @@ class double_hashing_impl
    * If vector-load is enabled, the return slot is always a multiple of (`cg_size` * `vector_width`)
    * to avoid illegal memory access.
    *
-   * @tparam CG CUDA Cooperative Groups type
+   * @tparam ProbeKey Probe key type
+   *
    * @param g the Cooperative Group for which the initial slot is needed
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename CG>
-  __device__ __forceinline__ iterator initial_slot(CG const& g, Key const k) noexcept
+  template <typename ProbeKey>
+  __device__ __forceinline__ iterator
+  initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
   {
     std::size_t index;
     auto const hash_value = hash1_(k);

@@ -715,9 +715,9 @@ static_map<Key, Value, Scope, Allocator>::device_view::find(CG g,
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename Hash, typename KeyEqual>
+template <typename ProbeKey, typename Hash, typename KeyEqual>
 __device__ bool static_map<Key, Value, Scope, Allocator>::device_view::contains(
-  Key const& k, Hash hash, KeyEqual key_equal) const noexcept
+  ProbeKey const& k, Hash hash, KeyEqual key_equal) const noexcept
 {
   auto current_slot = initial_slot(k, hash);
 
@@ -733,9 +733,12 @@ __device__ bool static_map<Key, Value, Scope, Allocator>::device_view::contains(
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename CG, typename Hash, typename KeyEqual>
-__device__ bool static_map<Key, Value, Scope, Allocator>::device_view::contains(
-  CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept
+template <typename CG, typename ProbeKey, typename Hash, typename KeyEqual>
+__device__ std::enable_if_t<std::is_invocable_v<KeyEqual, ProbeKey, Key>, bool>
+static_map<Key, Value, Scope, Allocator>::device_view::contains(CG const& g,
+                                                                ProbeKey const& k,
+                                                                Hash hash,
+                                                                KeyEqual key_equal) const noexcept
 {
   auto current_slot = initial_slot(g, k, hash);
 

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,15 @@
  */
 
 #include <cuco/detail/bitwise_compare.cuh>
+#include <cuco/detail/static_multimap/kernels.cuh>
 #include <cuco/detail/utils.cuh>
 
 #include <thrust/tuple.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
-namespace cuco {
+#include <cooperative_groups.h>
 
+namespace cuco {
 template <typename Key,
           typename Value,
           cuda::thread_scope Scope,
@@ -69,13 +71,16 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    *
    * To be used for Cooperative Group based probing.
    *
-   * @tparam CG Cooperative Group type
+   * @tparam ProbeKey Probe key type
+   *
    * @param g the Cooperative Group for which the initial slot is needed
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename CG>
-  __device__ __forceinline__ iterator initial_slot(CG const& g, Key const& k) noexcept
+  template <typename ProbeKey>
+  __device__ __forceinline__ iterator
+  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+               ProbeKey const& k) noexcept
   {
     return probe_sequence_.initial_slot(g, k);
   }
@@ -85,13 +90,16 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    *
    * To be used for Cooperative Group based probing.
    *
-   * @tparam CG Cooperative Group type
+   * @tparam ProbeKey Probe key type
+   *
    * @param g the Cooperative Group for which the initial slot is needed
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename CG>
-  __device__ __forceinline__ const_iterator initial_slot(CG g, Key const& k) const noexcept
+  template <typename ProbeKey>
+  __device__ __forceinline__ const_iterator
+  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+               ProbeKey const& k) const noexcept
   {
     return probe_sequence_.initial_slot(g, k);
   }
@@ -568,18 +576,21 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * `contains` at moderate to high load factors.
    *
    * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam CG Cooperative Group type
+   * @tparam ProbeKey Probe key type
    * @tparam KeyEqual Binary callable type
+   *
    * @param g The Cooperative Group used to perform the contains operation
    * @param k The key to search for
    * @param key_equal The binary callable used to compare two keys
    * for equality
    * @return A boolean indicating whether a key/value pair
    * containing `k` exists in the map
    */
-  template <bool uses_vector_load, typename CG, typename KeyEqual>
+  template <bool uses_vector_load, typename ProbeKey, typename KeyEqual>
   __device__ __forceinline__ std::enable_if_t<uses_vector_load, bool> contains(
-    CG g, Key const& k, KeyEqual key_equal) noexcept
+    cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+    ProbeKey const& k,
+    KeyEqual key_equal) noexcept
   {
     auto current_slot = initial_slot(g, k);
 
@@ -616,18 +627,21 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * `contains` at moderate to high load factors.
    *
    * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam CG Cooperative Group type
+   * @tparam ProbeKey Probe key type
    * @tparam KeyEqual Binary callable type
+   *
    * @param g The Cooperative Group used to perform the contains operation
    * @param k The key to search for
    * @param key_equal The binary callable used to compare two keys
    * for equality
    * @return A boolean indicating whether a key/value pair
    * containing `k` exists in the map
    */
-  template <bool uses_vector_load, typename CG, typename KeyEqual>
+  template <bool uses_vector_load, typename ProbeKey, typename KeyEqual>
   __device__ __forceinline__ std::enable_if_t<not uses_vector_load, bool> contains(
-    CG g, Key const& k, KeyEqual key_equal) noexcept
+    cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+    ProbeKey const& k,
+    KeyEqual key_equal) noexcept
   {
     auto current_slot = initial_slot(g, k);
 

@@ -108,7 +108,7 @@ template <typename Key,
           class ProbeSequence>
 template <typename InputIt, typename OutputIt, typename KeyEqual>
 void static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const
+  InputIt first, InputIt last, OutputIt output_begin, KeyEqual key_equal, cudaStream_t stream) const
 {
   auto const num_keys = std::distance(first, last);
   if (num_keys == 0) { return; }
@@ -536,11 +536,11 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
-template <typename KeyEqual>
+template <typename ProbeKey, typename KeyEqual>
 __device__ __forceinline__ bool
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::contains(
   cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
-  Key const& k,
+  ProbeKey const& k,
   KeyEqual key_equal) noexcept
 {
   return impl_.contains<uses_vector_load()>(g, k, key_equal);