From d825c47b98e068682c8c848ade67b4f079456624 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 17 May 2022 18:26:35 -0400 Subject: [PATCH] Introduce `DebugSyncStream` for `debug_synchronous` syncs. `SyncStream` should continue to be used for required synchronizations. `DebugSyncStream` should only be used for sync that are non-essential and only used to handle the `debug_synchronous` flag of the device algorithms. --- .../dispatch/dispatch_adjacent_difference.cuh | 17 ++-- cub/device/dispatch/dispatch_histogram.cuh | 12 ++- cub/device/dispatch/dispatch_merge_sort.cuh | 24 ++---- cub/device/dispatch/dispatch_radix_sort.cuh | 85 +++++++++++++------ cub/device/dispatch/dispatch_reduce.cuh | 44 ++++++++-- .../dispatch/dispatch_reduce_by_key.cuh | 24 +++++- cub/device/dispatch/dispatch_rle.cuh | 25 ++++-- cub/device/dispatch/dispatch_scan.cuh | 23 +++-- cub/device/dispatch/dispatch_scan_by_key.cuh | 22 ++++- .../dispatch/dispatch_segmented_sort.cuh | 33 +++---- cub/device/dispatch/dispatch_select_if.cuh | 22 ++++- cub/device/dispatch/dispatch_spmv_orig.cuh | 29 +++++-- .../dispatch/dispatch_three_way_partition.cuh | 16 ++-- .../dispatch/dispatch_unique_by_key.cuh | 12 ++- cub/util_device.cuh | 38 +++++++++ 15 files changed, 304 insertions(+), 122 deletions(-) diff --git a/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/device/dispatch/dispatch_adjacent_difference.cuh index 676ae6bfaf..60f37b2adb 100644 --- a/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -259,12 +259,10 @@ struct DispatchAdjacentDifference : public SelectedPolicy num_tiles, tile_size); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch @@ -302,12 +300,11 @@ struct DispatchAdjacentDifference : public SelectedPolicy difference_op, num_items); - if (debug_synchronous) + + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index 6781a26e87..1fae8638d0 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -667,11 +667,17 @@ public: tile_queue); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index 0b480899c5..86f1f8cd0e 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -704,12 +704,10 @@ struct DispatchMergeSort : SelectedPolicy block_sort_launcher.launch(); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch @@ -769,12 +767,10 @@ struct DispatchMergeSort : SelectedPolicy target_merged_tiles_number, tile_size); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch @@ -786,12 +782,10 @@ struct DispatchMergeSort : SelectedPolicy // Merge merge_launcher.launch(ping, target_merged_tiles_number); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 9e4e15f3f0..fa189ba7b8 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1073,10 +1073,17 @@ struct DispatchRadixSort : end_bit); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Update selector d_keys.selector ^= 1; @@ -1134,10 +1141,17 @@ struct DispatchRadixSort : pass_config.even_share); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Log scan_kernel configuration if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", @@ -1151,10 +1165,17 @@ struct DispatchRadixSort : pass_spine_length); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Log downsweep_kernel configuration if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", @@ -1177,10 +1198,17 @@ struct DispatchRadixSort : pass_config.even_share); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Update current bit current_bit += pass_bits; @@ -1345,14 +1373,13 @@ struct DispatchRadixSort : { break; } - if (debug_synchronous) + + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } - + // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; @@ -1370,14 +1397,13 @@ struct DispatchRadixSort : d_bins); if (CubDebug(error)) { - break; + break; } - if (debug_synchronous) + + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // use the other buffer if no overwrite is allowed @@ -1435,12 +1461,10 @@ struct DispatchRadixSort : break; } - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } } @@ -1827,10 +1851,17 @@ struct DispatchSegmentedRadixSort : current_bit, pass_bits); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Update current bit current_bit += pass_bits; diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index 68aa7949bf..ff2adf2005 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -414,10 +414,17 @@ struct DispatchReduce : init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); @@ -501,10 +508,17 @@ struct DispatchReduce : reduction_op); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Log single_reduce_sweep_kernel configuration if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", @@ -523,10 +537,17 @@ struct DispatchReduce : init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); @@ -734,10 +755,17 @@ struct DispatchSegmentedReduce : init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index 30f823c865..e292abbc56 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -301,14 +301,23 @@ struct DispatchReduceByKey d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Return if empty problem if (num_items == 0) + { break; + } // Get SM occupancy for reduce_by_key_kernel int reduce_by_key_sm_occupancy; @@ -346,10 +355,17 @@ struct DispatchReduceByKey num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh index 4e983bba70..35ab8aa0fb 100644 --- a/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/device/dispatch/dispatch_rle.cuh @@ -302,14 +302,23 @@ struct DeviceRleDispatch d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Return if empty problem if (num_items == 0) + { break; + } // Get SM occupancy for device_rle_sweep_kernel int device_rle_kernel_sm_occupancy; @@ -346,11 +355,17 @@ struct DeviceRleDispatch num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index a39159494b..f59097fcd9 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -329,11 +329,17 @@ struct DispatchScan: num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Get SM occupancy for scan_kernel int scan_sm_occupancy; @@ -367,10 +373,17 @@ struct DispatchScan: num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); diff --git a/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/device/dispatch/dispatch_scan_by_key.cuh index 9a1d0f215c..3557f19a16 100644 --- a/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -330,10 +330,17 @@ struct DispatchScanByKey: ).doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, tile_size, num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Get SM occupancy for scan_kernel int scan_sm_occupancy; @@ -371,10 +378,17 @@ struct DispatchScanByKey: num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); diff --git a/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/device/dispatch/dispatch_segmented_sort.cuh index 6e190cdd56..becd887a30 100644 --- a/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -605,12 +605,10 @@ DeviceSegmentedSortContinuation( } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - return error; - } + return error; } } @@ -666,12 +664,10 @@ DeviceSegmentedSortContinuation( } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - return error; - } + return error; } } @@ -1632,13 +1628,12 @@ private: return error; \ } \ \ - if (debug_synchronous) \ + error = detail::DebugSyncStream(stream, debug_synchronous); \ + if (CubDebug(error)) \ { \ - if (CubDebug(error = SyncStream(stream))) \ - { \ - return error; \ - } \ + return error; \ } + #endif // CUB_RDC_ENABLED // Clang format mangles some of this NV_IF_TARGET block @@ -1737,12 +1732,10 @@ private: } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - return error; - } + return error; } return error; diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index 3504c5812c..15e3e09a0d 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -295,10 +295,17 @@ struct DispatchSelectIf d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Return if empty problem if (num_items == 0) @@ -352,10 +359,17 @@ struct DispatchSelectIf num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index 7f6a29dd95..2c88059eec 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -525,10 +525,17 @@ struct DispatchSpmv spmv_params); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } break; } @@ -633,7 +640,11 @@ struct DispatchSpmv if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } // Log spmv_kernel configuration @@ -655,7 +666,11 @@ struct DispatchSpmv if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Run reduce-by-key fixup if necessary if (num_merge_tiles > 1) @@ -679,7 +694,11 @@ struct DispatchSpmv if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); diff --git a/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/device/dispatch/dispatch_three_way_partition.cuh index cb06438063..f643d83980 100644 --- a/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -360,12 +360,10 @@ struct DispatchThreeWayPartitionIf } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = cub::SyncStream(stream))) - { - break; - } + break; } // Get max x-dimension of grid @@ -430,12 +428,10 @@ struct DispatchThreeWayPartitionIf } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = cub::SyncStream(stream))) - { - break; - } + break; } } while (0); diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh index 971424bca0..9eb96f0cb9 100644 --- a/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -282,7 +282,11 @@ struct DispatchUniqueByKey: SelectedPolicy if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Return if empty problem if (num_items == 0) break; @@ -341,7 +345,11 @@ struct DispatchUniqueByKey: SelectedPolicy } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while(0); diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 6870cc49e6..ad145f6b97 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -517,6 +517,44 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) return result; } +namespace detail +{ + +/** + * Same as SyncStream, but intended for use with the debug_synchronous flags + * in device algorithms. This should not be used if synchronization is required + * for correctness. + * + * If `debug_synchronous` is false, this function will immediately return + * cudaSuccess. If true, one of the following will occur: + * + * If synchronization is supported by the current compilation target and + * settings, the sync is performed and the sync result is returned. + * + * If syncs are not supported then no sync is performed, but a message is logged + * via _CubLog and cudaSuccess is returned. + */ +CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream, + bool debug_synchronous) +{ + if (!debug_synchronous) + { + return cudaSuccess; + } + +#if 1 // All valid targets currently support device-side synchronization + _CubLog("%s\n", "Synchronizing..."); + return SyncStream(stream); +#else + (void)stream; + _CubLog("%s\n", + "WARNING: Skipping CUB `debug_synchronous` synchronization " + "(unsupported target)."); + return cudaSuccess; +#endif +} + +} // namespace detail /** * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.