Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions src/cuda/cuda_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,16 @@ class CudaError : public std::runtime_error {
} while (0)

#ifdef NDEBUG
#define CUDA_CHECK_LAUNCH() \
do { \
cudaError_t err = cudaPeekAtLastError(); \
if (err != cudaSuccess) { \
std::stringstream ss; \
ss << "CUDA launch error in " << __func__ << " at " \
<< __FILE__ << ":" << __LINE__ << " - " \
<< cudaGetErrorString(err); \
throw Generators::CudaError(ss.str(), err); \
Comment thread
tianleiwu marked this conversation as resolved.
} \
#define CUDA_CHECK_LAUNCH() \
do { \
cudaError_t err = cudaPeekAtLastError(); \
if (err != cudaSuccess) { \
std::stringstream ss; \
ss << "CUDA launch error in " << __func__ << " at " \
<< __FILE__ << ":" << __LINE__ << " - " \
<< cudaGetErrorString(err); \
throw Generators::CudaError(ss.str(), cudaGetLastError()); \
} \
} while (0)
#else
#define CUDA_CHECK_LAUNCH() \
Expand Down
14 changes: 11 additions & 3 deletions src/cuda/cuda_topk_benchmark.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ static TopkAlgo BenchmarkAndSelectBestAlgo(TopkData* topk_data,
int vocab_size,
int batch_size,
int k) {
// Clear any stale CUDA error left over from previous operations to prevent false failures.
// A successful CUDA API call does NOT reset the thread-local error state, so an earlier
// error (e.g., from TopkData construction or prior inference) can persist and then be
// misreported by the host-side CUDA_CHECK_LAUNCH() checks made while benchmarking the
// kernels. cudaGetLastError() returns that error and resets the state to cudaSuccess.
Comment thread
tianleiwu marked this conversation as resolved.
cudaGetLastError();

float min_latency = std::numeric_limits<float>::max();
TopkAlgo best_algo = TopkAlgo::UNKNOWN;

Expand Down Expand Up @@ -147,9 +153,11 @@ static TopkAlgo BenchmarkAndSelectBestAlgo(TopkData* topk_data,
});
}

// Candidate: Hybrid Sort. This is a robust fallback. We benchmark it if either the cooperative
// kernels are not supported, or if the vocab size is small, where hybrid can sometimes be faster.
if (!use_iterative_sort && !use_cascaded_sort && !use_flash_convergent || vocab_size <= 4096) {
// Candidate: Hybrid Sort. This is a robust fallback. We benchmark it if the cooperative
// kernels are not supported, if their benchmarks all failed at runtime (best_algo is still
// UNKNOWN despite IsSupported returning true), or if the vocab size is small, where hybrid
// can sometimes be faster.
if (best_algo == TopkAlgo::UNKNOWN || vocab_size <= 4096) {
Comment thread
tianleiwu marked this conversation as resolved.
Outdated
if (hybrid_sort::IsSupported(batch_size, vocab_size, k)) {
BENCHMARK_KERNEL(TopkAlgo::HYBRID, [&]() {
hybrid_sort::RunTopK(topk_data, stream, scores_in, vocab_size, batch_size, k);
Expand Down
Loading