Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Pull previous changes for reduce by key
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Aug 2, 2022
1 parent 83cdc5d commit b200436
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 64 deletions.
149 changes: 91 additions & 58 deletions cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <cub/device/dispatch/dispatch_scan.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>

Expand Down Expand Up @@ -372,11 +373,6 @@ struct DispatchReduceByKey
* @param[in] stream
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
*
* @param[in] debug_synchronous
* Whether or not to synchronize the stream after every kernel launch to
* check for errors. Also causes launch configurations to be printed to
* the console. Default is `false`.
*
* @param[in] ptx_version
* PTX version of dispatch kernels
*
Expand Down Expand Up @@ -405,35 +401,11 @@ struct DispatchReduceByKey
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous,
int /*ptx_version*/,
ScanInitKernelT init_kernel,
ReduceByKeyKernelT reduce_by_key_kernel,
KernelConfig reduce_by_key_config)
{

#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
(void)temp_storage_bytes;
(void)d_keys_in;
(void)d_unique_out;
(void)d_values_in;
(void)d_aggregates_out;
(void)d_num_runs_out;
(void)equality_op;
(void)reduction_op;
(void)num_items;
(void)stream;
(void)debug_synchronous;
(void)init_kernel;
(void)reduce_by_key_kernel;
(void)reduce_by_key_config;

// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);

#else

cudaError error = cudaSuccess;
do
{
Expand Down Expand Up @@ -488,13 +460,13 @@ struct DispatchReduceByKey
// Log init_kernel configuration
int init_grid_size =
CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous)
{
_CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n",
init_grid_size,
INIT_KERNEL_THREADS,
(long long)stream);
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n",
init_grid_size,
INIT_KERNEL_THREADS,
(long long)stream);
#endif

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand All @@ -511,7 +483,8 @@ struct DispatchReduceByKey
}

// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream))))
error = detail::DebugSyncStream(stream);
if (CubDebug(error))
{
break;
}
Expand Down Expand Up @@ -546,17 +519,16 @@ struct DispatchReduceByKey
start_tile += scan_grid_size)
{
// Log reduce_by_key_kernel configuration
if (debug_synchronous)
{
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
"items per thread, %d SM occupancy\n",
start_tile,
scan_grid_size,
reduce_by_key_config.block_threads,
(long long)stream,
reduce_by_key_config.items_per_thread,
reduce_by_key_sm_occupancy);
}
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
"items per thread, %d SM occupancy\n",
start_tile,
scan_grid_size,
reduce_by_key_config.block_threads,
(long long)stream,
reduce_by_key_config.items_per_thread,
reduce_by_key_sm_occupancy);
#endif

// Invoke reduce_by_key_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand All @@ -583,16 +555,54 @@ struct DispatchReduceByKey
}

// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream))))
error = detail::DebugSyncStream(stream);
if (CubDebug(error))
{
break;
}
}
} while (0);

return error;
}

#endif // CUB_RUNTIME_ENABLED
  /**
   * @brief Deprecated internal dispatch entry point retained for backward
   *        compatibility with callers that still pass `debug_synchronous`.
   *
   * The `debug_synchronous` argument is accepted but ignored: runtime debug
   * synchronization is now controlled by the `CUB_DETAIL_DEBUG_ENABLE_LOG` /
   * debug-sync build configuration instead of a per-call flag (the
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED annotation marks this, and
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG emits the corresponding notice).
   * All other arguments are forwarded unchanged to the primary overload that
   * takes the kernel pointers but no `debug_synchronous` parameter.
   */
  template <typename ScanInitKernelT, typename ReduceByKeyKernelT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
  Dispatch(void *d_temp_storage,
           size_t &temp_storage_bytes,
           KeysInputIteratorT d_keys_in,
           UniqueOutputIteratorT d_unique_out,
           ValuesInputIteratorT d_values_in,
           AggregatesOutputIteratorT d_aggregates_out,
           NumRunsOutputIteratorT d_num_runs_out,
           EqualityOpT equality_op,
           ReductionOpT reduction_op,
           OffsetT num_items,
           cudaStream_t stream,
           bool debug_synchronous,   // ignored; kept for source compatibility
           int ptx_version,
           ScanInitKernelT init_kernel,
           ReduceByKeyKernelT reduce_by_key_kernel,
           KernelConfig reduce_by_key_config)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the non-deprecated overload, dropping debug_synchronous.
    return Dispatch<ScanInitKernelT, ReduceByKeyKernelT>(d_temp_storage,
                                                         temp_storage_bytes,
                                                         d_keys_in,
                                                         d_unique_out,
                                                         d_values_in,
                                                         d_aggregates_out,
                                                         d_num_runs_out,
                                                         equality_op,
                                                         reduction_op,
                                                         num_items,
                                                         stream,
                                                         ptx_version,
                                                         init_kernel,
                                                         reduce_by_key_kernel,
                                                         reduce_by_key_config);
  }

/**
Expand Down Expand Up @@ -633,11 +643,6 @@ struct DispatchReduceByKey
*
* @param[in] stream
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
*
* @param[in] debug_synchronous
* Whether or not to synchronize the stream after every kernel launch to
* check for errors. Also causes launch configurations to be printed to
* the console. Default is `false`.
*/
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
Dispatch(void *d_temp_storage,
Expand All @@ -650,8 +655,7 @@ struct DispatchReduceByKey
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
cudaStream_t stream)
{
cudaError error = cudaSuccess;

Expand Down Expand Up @@ -682,7 +686,6 @@ struct DispatchReduceByKey
reduction_op,
num_items,
stream,
debug_synchronous,
ptx_version,
DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy,
Expand All @@ -704,6 +707,36 @@ struct DispatchReduceByKey

return error;
}

  /**
   * @brief Deprecated public dispatch entry point retained for backward
   *        compatibility with callers that still pass `debug_synchronous`.
   *
   * The `debug_synchronous` argument is accepted but ignored: per-launch debug
   * synchronization/logging is now a compile-time configuration (see the
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED annotation;
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG emits the corresponding notice).
   * All other arguments are forwarded unchanged to the stream-only overload.
   */
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
  Dispatch(void *d_temp_storage,
           size_t &temp_storage_bytes,
           KeysInputIteratorT d_keys_in,
           UniqueOutputIteratorT d_unique_out,
           ValuesInputIteratorT d_values_in,
           AggregatesOutputIteratorT d_aggregates_out,
           NumRunsOutputIteratorT d_num_runs_out,
           EqualityOpT equality_op,
           ReductionOpT reduction_op,
           OffsetT num_items,
           cudaStream_t stream,
           bool debug_synchronous)   // ignored; kept for source compatibility
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the non-deprecated overload, dropping debug_synchronous.
    return Dispatch(d_temp_storage,
                    temp_storage_bytes,
                    d_keys_in,
                    d_unique_out,
                    d_values_in,
                    d_aggregates_out,
                    d_num_runs_out,
                    equality_op,
                    reduction_op,
                    num_items,
                    stream);
  }
};

CUB_NAMESPACE_END
Expand Down
8 changes: 2 additions & 6 deletions test/test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1152,9 +1152,7 @@ void TestAccumulatorTypes()
d_out,
cub::Sum{},
init,
num_items,
0,
true));
num_items));

CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
Expand All @@ -1166,9 +1164,7 @@ void TestAccumulatorTypes()
d_out,
cub::Sum{},
init,
num_items,
0,
true));
num_items));

int ok{};
CubDebugExit(cudaMemcpy(&ok, d_ok_count, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down

0 comments on commit b200436

Please sign in to comment.