diff --git a/src/cuda/beam_search_scorer_cuda.cpp b/src/cuda/beam_search_scorer_cuda.cpp
index 61fc86accb..964dcf430a 100644
--- a/src/cuda/beam_search_scorer_cuda.cpp
+++ b/src/cuda/beam_search_scorer_cuda.cpp
@@ -4,6 +4,7 @@
 #include "generators.h"
 #include "search.h"
 #include "search_cuda.h"
+#include "cuda_common.h"
 #include "beam_search_scorer_cuda.cuh"
 #include "beam_search_scorer_cuda.h"
 #include "interface.h"
@@ -22,7 +23,7 @@ BeamSearchScorer_Cuda::BeamSearchScorer_Cuda(const GeneratorParams& parameters,
   state_cpu_->not_done_count_ = parameters.search.batch_size;
   state_cpu_->hypothesis_buffer_used_ = 0;
   state_gpu_ = CudaMallocArray<cuda::BeamScorerState>(1);
-  cudaMemcpyAsync(state_gpu_.get(), state_cpu_.get(), sizeof(cuda::BeamScorerState), ::cudaMemcpyHostToDevice, stream_);
+  CUDA_CHECK(cudaMemcpyAsync(state_gpu_.get(), state_cpu_.get(), sizeof(cuda::BeamScorerState), ::cudaMemcpyHostToDevice, stream_));
 
   size_t batch_beam_size = state_cpu_->batch_size_ * state_cpu_->num_beams_;
 
@@ -63,7 +64,7 @@ void BeamSearchScorer_Cuda::Process(Sequences& sequences,
                                        next_tokens,
                                        next_indices,
                                        stream_);
-  cudaEventRecord(event_process_complete_, stream_);
+  CUDA_CHECK(cudaEventRecord(event_process_complete_, stream_));
 
   cuda::LaunchBeamSearchScorer_AppendNextTokenToSequences(*state_cpu_,
                                                           *state_gpu_,
@@ -76,7 +77,7 @@ void BeamSearchScorer_Cuda::Process(Sequences& sequences,
 }
 
 bool BeamSearchScorer_Cuda::IsDoneLater() const {
-  cudaEventSynchronize(event_process_complete_);
+  CUDA_CHECK(cudaEventSynchronize(event_process_complete_));
   return state_cpu_->not_done_count_ == 0;
 }
 
@@ -90,7 +91,7 @@ DeviceSpan<int32_t> BeamSearchScorer_Cuda::GetBeamHypothesis(size_t batch_id, si
   cuda_host_unique_ptr<int> hypothesis_length = CudaMallocHostArray<int>(1);
   cuda_host_unique_ptr<float> hypothesis_score = CudaMallocHostArray<float>(1);
   cuda::LaunchBeamSearchScorer_GetHypothesisPtr(batch_id, beam_id, beam_hyps_, hypothesis_ptr.get(), hypothesis_length.get(), hypothesis_score.get(), stream_);
-  CudaCheck() == cudaStreamSynchronize(stream_);
+  CUDA_CHECK(cudaStreamSynchronize(stream_));
   std::span<int32_t> hypothesis(*hypothesis_ptr.get(), *hypothesis_length.get());
   // Translate the hypothesis span back to the original device buffer span
   return hypothesis_buffer_.subspan(hypothesis.data() - hypothesis_buffer_.Span().data(), hypothesis.size());
diff --git a/src/cuda/beam_search_scorer_cuda.cu b/src/cuda/beam_search_scorer_cuda.cu
index d7247d55e4..58c72b45d7 100644
--- a/src/cuda/beam_search_scorer_cuda.cu
+++ b/src/cuda/beam_search_scorer_cuda.cu
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <algorithm>
 #include "span.h"
+#include "cuda_common.h"
 #include "beam_search_scorer_cuda.cuh"
 
 namespace Generators {
@@ -49,6 +50,7 @@ void LaunchInitializeBeamHypotheses(std::span<BeamHypotheses> beam_hyps,
                                                                              length_penalty,
                                                                              beams.data(),
                                                                              num_beams);
+  CUDA_CHECK_LAUNCH();
 }
 
 __device__ void BeamHypotheses::Add(const int32_t* hypothesis, int hypothesis_length, float sum_logprobs) {
@@ -187,6 +189,7 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu,
                                                                     next_scores.data(),
                                                                     next_tokens.data(),
                                                                     next_indices.data());
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void BeamSearchScorer_AppendNextTokenToSequences1(BeamScorerState& state,
@@ -254,6 +257,7 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp
                                                                                   next_sequences.data(),
                                                                                   sequence_length,
                                                                                   next_beam_tokens.data());
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void BeamSearchScorer_Finalize(BeamScorerState& state,
@@ -298,6 +302,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size,
                                                           beam_hyps.data(),
                                                           hypothesis_buffer.data(),
                                                           final_beam_scores.data());
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void BeamSearchScorer_GetHypothesisPtr(size_t batch_id,
@@ -326,6 +331,7 @@ void LaunchBeamSearchScorer_GetHypothesisPtr(size_t batch_id,
                                                          hypothesis_ptr,
                                                          hypothesis_length,
                                                          hypothesis_score);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void InitScoresKernel(float* beam_scores,
@@ -347,6 +353,7 @@ void LaunchInitScoresKernel(
   constexpr int blockSize = 256;
   const int gridSize = (total_elements + blockSize - 1) / blockSize;
   InitScoresKernel<<<gridSize, blockSize, 0, stream>>>(beam_scores, num_beams, total_elements);
+  CUDA_CHECK_LAUNCH();
 }
 
 }  // namespace cuda
diff --git a/src/cuda/beam_search_topk.cu b/src/cuda/beam_search_topk.cu
index 32da76fa2b..4c930413d1 100644
--- a/src/cuda/beam_search_topk.cu
+++ b/src/cuda/beam_search_topk.cu
@@ -5,6 +5,7 @@
 #include <cub/cub.cuh>
 #include <limits>
 #include "beam_search_topk.h"
+#include "cuda_common.h"
 
 namespace Generators {
 namespace cuda {
@@ -124,17 +125,20 @@ void LaunchBeamSearchOnlineTopKStage2Kernel(
   if (parts_per_beam <= 32) {
     BeamSearchOnlineTopKStage2Kernel<T, max_k, 32><<<batch_beam_size, 32, smem_stage2_size, stream>>>(
         topk_values_tmp, topk_indices_tmp, K, vocab_size, parts_per_beam, output_values, output_indices);
+    CUDA_CHECK_LAUNCH();
     return;
   }
 
   if (parts_per_beam <= 64) {
     BeamSearchOnlineTopKStage2Kernel<T, max_k, 64><<<batch_beam_size, 64, smem_stage2_size, stream>>>(
         topk_values_tmp, topk_indices_tmp, K, vocab_size, parts_per_beam, output_values, output_indices);
+    CUDA_CHECK_LAUNCH();
     return;
   }
 
   BeamSearchOnlineTopKStage2Kernel<T, max_k, 128><<<batch_beam_size, 128, smem_stage2_size, stream>>>(
       topk_values_tmp, topk_indices_tmp, K, vocab_size, parts_per_beam, output_values, output_indices);
+  CUDA_CHECK_LAUNCH();
   return;
 }
 
@@ -160,12 +164,13 @@ void TopKLauncherMaxK(
 
   dim3 grid(batch_beam_size, voc_parts);
 
-  cudaFuncSetAttribute(BeamSearchOnlineTopKStage1Kernel<T, max_k, kThreadBlockSize>,
-                       cudaFuncAttributePreferredSharedMemoryCarveout,
-                       cudaSharedmemCarveoutMaxL1);
+  CUDA_CHECK(cudaFuncSetAttribute(BeamSearchOnlineTopKStage1Kernel<T, max_k, kThreadBlockSize>,
+                                  cudaFuncAttributePreferredSharedMemoryCarveout,
+                                  cudaSharedmemCarveoutMaxL1));
 
   BeamSearchOnlineTopKStage1Kernel<T, max_k, kThreadBlockSize>
       <<<grid, kThreadBlockSize, 0, stream>>>(input, K, vocab_size, (vocab_size + voc_parts - 1) / voc_parts, output_values_tmp, output_indices_tmp);
+  CUDA_CHECK_LAUNCH();
 
   LaunchBeamSearchOnlineTopKStage2Kernel<T, max_k>(
       output_values_tmp,
@@ -242,6 +247,7 @@ void LaunchBatchTopKKernel(const T* topk_scores,
   } else {
     BatchTopKKernelLauncher(64);
   }
+  CUDA_CHECK_LAUNCH();
 }
 
 template void LaunchBatchTopKKernel(const float* topk_scores,
diff --git a/src/cuda/cuda_common.h b/src/cuda/cuda_common.h
index 7e283a6a4d..76cd502307 100644
--- a/src/cuda/cuda_common.h
+++ b/src/cuda/cuda_common.h
@@ -16,22 +16,51 @@ namespace Generators {
 
 cudaStream_t GetStream();
 
-void OnCudaError(cudaError_t error);
+#define CeilDiv(a, b) ((a + (b - 1)) / b)
 
-struct CudaCheck {
-  void operator==(cudaError_t error) {
-    if (error != cudaSuccess)
-      OnCudaError(error);
-  }
+class CudaError : public std::runtime_error {
+ public:
+  explicit CudaError(const std::string& msg, cudaError_t code)
+      : std::runtime_error(msg), code_(code) {}
+
+  cudaError_t code() const noexcept { return code_; }
+
+ private:
+  cudaError_t code_;
 };
 
+#define CUDA_CHECK(call)                                         \
+  do {                                                           \
+    cudaError_t err = (call);                                    \
+    if (err != cudaSuccess) {                                    \
+      std::stringstream ss;                                      \
+      ss << "CUDA error in " << __func__ << " at " << __FILE__   \
+         << ":" << __LINE__ << " - " << cudaGetErrorString(err); \
+      (void)cudaGetLastError();                                  \
+      throw Generators::CudaError(ss.str(), err);                \
+    }                                                            \
+  } while (0)
+
+#define CUDA_CHECK_LAUNCH()                               \
+  do {                                                    \
+    cudaError_t err = cudaPeekAtLastError();              \
+    if (err != cudaSuccess) {                             \
+      std::stringstream ss;                               \
+      ss << "CUDA launch error in " << __func__ << " at " \
+         << __FILE__ << ":" << __LINE__ << " - "          \
+         << cudaGetErrorString(err);                      \
+      (void)cudaGetLastError();                           \
+      throw Generators::CudaError(ss.str(), err);         \
+    }                                                     \
+  } while (0)
+
 struct cuda_event_holder {
   cuda_event_holder() {
-    cudaEventCreate(&v_);
+    CUDA_CHECK(cudaEventCreate(&v_));
   }
 
   cuda_event_holder(unsigned flags) {
-    cudaEventCreateWithFlags(&v_, flags);
+    CUDA_CHECK(cudaEventCreateWithFlags(&v_, flags));
   }
 
   ~cuda_event_holder() {
@@ -48,7 +77,7 @@ struct cuda_event_holder {
 struct cuda_stream_holder {
   void Create() {
     assert(!v_);
-    cudaStreamCreate(&v_);
+    CUDA_CHECK(cudaStreamCreate(&v_));
   }
 
   ~cuda_stream_holder() {
@@ -81,7 +110,7 @@ using cuda_host_unique_ptr = std::unique_ptr<T, CudaHostDeleter>;
 template <typename T>
 cuda_host_unique_ptr<T> CudaMallocHostArray(size_t count, std::span<T>* p_span = nullptr) {
   T* p;
-  ::cudaMallocHost(&p, sizeof(T) * count);
+  CUDA_CHECK(::cudaMallocHost(&p, sizeof(T) * count));
   if (p_span)
     *p_span = std::span<T>(p, count);
   return cuda_host_unique_ptr<T>{p};
@@ -99,60 +128,10 @@ using cuda_unique_ptr = std::unique_ptr<T, CudaDeleter>;
 template <typename T>
 cuda_unique_ptr<T> CudaMallocArray(size_t count, std::span<T>* p_span = nullptr) {
   T* p;
-  ::cudaMalloc(&p, sizeof(T) * count);
+  CUDA_CHECK(::cudaMalloc(&p, sizeof(T) * count));
   if (p_span)
     *p_span = std::span<T>(p, count);
   return cuda_unique_ptr<T>{p};
 }
 
-#define CeilDiv(a, b) ((a + (b - 1)) / b)
-
-class CudaError : public std::runtime_error {
- public:
-  explicit CudaError(const std::string& msg, cudaError_t code)
-      : std::runtime_error(msg), code_(code) {}
-
-  cudaError_t code() const noexcept { return code_; }
-
- private:
-  cudaError_t code_;
-};
-
-#define CUDA_CHECK(call)                                         \
-  do {                                                           \
-    cudaError_t err = (call);                                    \
-    if (err != cudaSuccess) {                                    \
-      std::stringstream ss;                                      \
-      ss << "CUDA error in " << __func__ << " at " << __FILE__   \
-         << ":" << __LINE__ << " - " << cudaGetErrorString(err); \
-      throw Generators::CudaError(ss.str(), err);                \
-    }                                                            \
-  } while (0)
-
-#ifdef NDEBUG
-#define CUDA_CHECK_LAUNCH()                               \
-  do {                                                    \
-    cudaError_t err = cudaPeekAtLastError();              \
-    if (err != cudaSuccess) {                             \
-      std::stringstream ss;                               \
-      ss << "CUDA launch error in " << __func__ << " at " \
-         << __FILE__ << ":" << __LINE__ << " - "          \
-         << cudaGetErrorString(err);                      \
-      throw Generators::CudaError(ss.str(), err);         \
-    }                                                     \
-  } while (0)
-#else
-#define CUDA_CHECK_LAUNCH()                               \
-  do {                                                    \
-    cudaError_t err = cudaGetLastError();                 \
-    if (err != cudaSuccess) {                             \
-      std::stringstream ss;                               \
-      ss << "CUDA launch error in " << __func__ << " at " \
-         << __FILE__ << ":" << __LINE__ << " - "          \
-         << cudaGetErrorString(err);                      \
-      throw Generators::CudaError(ss.str(), err);         \
-    }                                                     \
-  } while (0)
-#endif
-
 }  // namespace Generators
diff --git a/src/cuda/cuda_topk.cu b/src/cuda/cuda_topk.cu
index 2a13ae899f..1f9160c4d9 100644
--- a/src/cuda/cuda_topk.cu
+++ b/src/cuda/cuda_topk.cu
@@ -6,6 +6,7 @@
 #include <cassert>
 
 #include "cuda_topk.h"
+#include "cuda_common.h"
 #include "cuda_topk_benchmark_cache.h"
 #include "cuda_topk_benchmark.cuh"
 #include "cuda_topk_common.cuh"
@@ -153,6 +154,7 @@ void TopkDataCompact::CompactOutput(int batch_size, int k, cudaStream_t stream)
   dim3 block(256);
   CompactStridedData<float><<<grid, block, 0, stream>>>(topk_scores, topk_scores_compact.get(), k, batch_size, topk_stride);
   CompactStridedData<int><<<grid, block, 0, stream>>>(topk_indices, topk_indices_compact.get(), k, batch_size, topk_stride);
+  CUDA_CHECK_LAUNCH();
 }
 
 // Main dispatcher for Top-K. It implements the caching and benchmarking logic to select and run the best algorithm.
diff --git a/src/cuda/cuda_topk_benchmark.cuh b/src/cuda/cuda_topk_benchmark.cuh
index 9766e7a9be..e7519daaba 100644
--- a/src/cuda/cuda_topk_benchmark.cuh
+++ b/src/cuda/cuda_topk_benchmark.cuh
@@ -101,6 +101,12 @@ static TopkAlgo BenchmarkAndSelectBestAlgo(TopkData* topk_data,
                                            int vocab_size,
                                            int batch_size,
                                            int k) {
+  // Clear any stale CUDA errors from previous operations to prevent false failures.
+  // Successful CUDA API calls do NOT clear the thread-local error state, so a stale
+  // error (e.g., from TopkData construction or prior inference) can persist and be
+  // falsely detected by CUDA_CHECK_LAUNCH() inside the benchmark kernels.
+  cudaGetLastError();
+
   float min_latency = std::numeric_limits<float>::max();
   TopkAlgo best_algo = TopkAlgo::UNKNOWN;
 
@@ -147,9 +153,11 @@ static TopkAlgo BenchmarkAndSelectBestAlgo(TopkData* topk_data,
     });
   }
 
-  // Candidate: Hybrid Sort. This is a robust fallback. We benchmark it if either the cooperative
-  // kernels are not supported, or if the vocab size is small, where hybrid can sometimes be faster.
-  if (!use_iterative_sort && !use_cascaded_sort && !use_flash_convergent || vocab_size <= 4096) {
+  // Candidate: Hybrid Sort. This is a robust fallback. We benchmark it if the cooperative
+  // kernels are not supported, if their benchmarks all failed at runtime (best_algo is still
+  // UNKNOWN despite IsSupported returning true), or if the vocab size is small, where hybrid
+  // can sometimes be faster.
+  if (best_algo == TopkAlgo::UNKNOWN || (!use_iterative_sort && !use_cascaded_sort && !use_flash_convergent) || vocab_size <= 4096) {
     if (hybrid_sort::IsSupported(batch_size, vocab_size, k)) {
       BENCHMARK_KERNEL(TopkAlgo::HYBRID, [&]() {
         hybrid_sort::RunTopK(topk_data, stream, scores_in, vocab_size, batch_size, k);
diff --git a/src/cuda/cuda_topk_per_batch_radix_sort.cuh b/src/cuda/cuda_topk_per_batch_radix_sort.cuh
index 6fb4329517..e1c0567119 100644
--- a/src/cuda/cuda_topk_per_batch_radix_sort.cuh
+++ b/src/cuda/cuda_topk_per_batch_radix_sort.cuh
@@ -75,8 +75,9 @@ void RunTopK(TopkData* data, cudaStream_t stream, const float* scores_in, int vo
 
     // Populate workspace buffers with the current batch item's scores and indices.
     FillInput<<<blocks_per_batch, block_size, 0, stream>>>(current_scores_in, workspace_scores, workspace_indices, vocab_size);
+    CUDA_CHECK_LAUNCH();
     // Launch the CUB radix sort. It sorts from the workspace directly into the final output buffers.
-    cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_storage_bytes, workspace_scores, final_scores_out, workspace_indices, final_indices_out, vocab_size, 0, sizeof(float) * 8, stream);
+    CUDA_CHECK(cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_storage_bytes, workspace_scores, final_scores_out, workspace_indices, final_indices_out, vocab_size, 0, sizeof(float) * 8, stream));
   }
 
   data->topk_scores = final_scores_buffer;
diff --git a/src/cuda/model_kernels.cu b/src/cuda/model_kernels.cu
index 09f42ae903..0f96276b8a 100644
--- a/src/cuda/model_kernels.cu
+++ b/src/cuda/model_kernels.cu
@@ -7,6 +7,7 @@
 #include <limits>
 #include <assert.h>
 #include <stdio.h>
+#include "cuda_common.h"
 
 namespace Generators {
 namespace cuda {
@@ -36,6 +37,7 @@ void Launch_UpdatePositionIds(T* positions, int batch_beam_size, int total_lengt
     // For batch size > 1 we increment position ids by 1... continuous decoding is not supported
     UpdatePositionIds<T><<<(batch_beam_size + 255) / 256, 256, 0, stream>>>(positions, batch_beam_size);
   }
+  CUDA_CHECK_LAUNCH();
 }
 
 template void Launch_UpdatePositionIds(int32_t* positions, int batch_beam_size, int total_length, int new_kv_length, cudaStream_t stream);
@@ -77,6 +79,7 @@ void Launch_UpdateAttentionMask(T* next_mask_data, T* mask_data, int batch_beam_
     int blocks = (batch_beam_size * total_length + threads - 1) / threads;
     CopyAndUpdateAttentionMask<T><<<blocks, threads, 0, stream>>>(next_mask_data, mask_data, batch_beam_size, new_kv_length, total_length);
   }
+  CUDA_CHECK_LAUNCH();
 }
 
 template void Launch_UpdateAttentionMask(int32_t* next_mask_data, int32_t* mask_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream);
@@ -96,6 +99,7 @@ void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_siz
   int block_size = 256;
   int num_blocks = (batch_beam_size * vocab_size + block_size - 1) / block_size;
   AddLogitsMask<<<num_blocks, block_size, 0, stream>>>(batch_logits, batch_beam_size, vocab_size, logits_mask);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void ConvertFp16ToFp32(const half* src, float* dst, int count) {
@@ -108,6 +112,7 @@ void LaunchFp16ToFp32(const uint16_t* fp16, float* fp32, int count, cudaStream_t
   int block_size = 256;
   int num_blocks = (count + block_size - 1) / block_size;
   ConvertFp16ToFp32<<<num_blocks, block_size, 0, stream>>>(reinterpret_cast<const half*>(fp16), fp32, count);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void ConvertFp32ToFp16(const float* src, half* dst, int count) {
@@ -120,6 +125,7 @@ void LaunchFp32ToFp16(const float* fp32, uint16_t* fp16, int count, cudaStream_t
   int block_size = 256;
   int num_blocks = (count + block_size - 1) / block_size;
   ConvertFp32ToFp16<<<num_blocks, block_size, 0, stream>>>(fp32, reinterpret_cast<half*>(fp16), count);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void ConvertInt32ToInt64(const int32_t* src, int64_t* dst, int count) {
@@ -133,6 +139,7 @@ void LaunchInt32ToInt64(const int32_t* src, int64_t* dst, int count, cudaStream_
   int block_size = 256;
   int num_blocks = (count + block_size - 1) / block_size;
   ConvertInt32ToInt64<<<num_blocks, block_size, 0, stream>>>(src, dst, count);
+  CUDA_CHECK_LAUNCH();
 }
 
 namespace {
@@ -197,6 +204,7 @@ void ReorderPastStatesKernelLauncher(void* out_buffer,
                                                         num_heads,
                                                         max_length,
                                                         chunked_head_size);
+    CUDA_CHECK_LAUNCH();
   }
 }
 
@@ -258,6 +266,7 @@ void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache,
                                                            input_seq_length,
                                                            max_seq_length,
                                                            current_length);
+  CUDA_CHECK_LAUNCH();
 }
 
 template <typename T>
@@ -325,6 +334,7 @@ void LaunchCopyCrossQKSingleDecodeStep(cudaStream_t stream,
                                                                   max_length,
                                                                   sequence_length);
   }
+  CUDA_CHECK_LAUNCH();
 }
 
 template void LaunchCopyCrossQKSingleDecodeStep(cudaStream_t stream,
@@ -419,6 +429,7 @@ void LaunchFinalizeCrossQK(cudaStream_t stream,
                                                                  cross_qk_output,
                                                                  cache_indir_data);
   }
+  CUDA_CHECK_LAUNCH();
 }
 
 template void LaunchFinalizeCrossQK(cudaStream_t stream,
diff --git a/src/cuda/search_cuda.cpp b/src/cuda/search_cuda.cpp
index b306d0f473..cc4d5c01cf 100644
--- a/src/cuda/search_cuda.cpp
+++ b/src/cuda/search_cuda.cpp
@@ -5,6 +5,7 @@
 #include "interface.h"
 #include "search.h"
 #include "search_cuda.h"
+#include "cuda_common.h"
 #include "beam_search_scorer_cuda.cuh"
 #include "beam_search_scorer_cuda.h"
 #include "beam_search_topk.h"
@@ -13,12 +14,6 @@
 
 namespace Generators {
 
-void OnCudaError(cudaError_t error) {
-  printf("Cuda Error: %s\n", cudaGetErrorString(error));
-  assert(false);
-  throw std::exception();
-}
-
 Search_Cuda::Search_Cuda(const GeneratorParams& params)
     : Search{params} {
   auto batch_beam_size = params.BatchBeamSize();
@@ -69,14 +64,14 @@ BeamSearch_Cuda::BeamSearch_Cuda(const GeneratorParams& params)
   topk_buffer_ = CudaMallocArray<float>(topk_buffer_size);
   static_assert(sizeof(float) == sizeof(int32_t));  // The topk_buffer assumes these match, fix for float16
 
-  cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * sizeof(float), GetStream());
+  CUDA_CHECK(cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * sizeof(float), GetStream()));
 }
 
 BeamSearch_Cuda::~BeamSearch_Cuda() = default;
 
 void Search_Cuda::ResetDone() {
   *done_cpu_ = false;
-  cudaMemsetAsync(eos_seen_.data(), 0, eos_seen_.size_bytes(), GetStream());
+  CUDA_CHECK(cudaMemsetAsync(eos_seen_.data(), 0, eos_seen_.size_bytes(), GetStream()));
 }
 
 DeviceSpan<float> Search_Cuda::GetLogits() const {
@@ -105,8 +100,8 @@ void BeamSearch_Cuda::SelectTop() {
 
   // Copy next_token_scores to CPU
   auto next_token_scores_cpu = CudaMallocHostArray<float>(params_->BatchBeamSize() * params_->config.model.vocab_size);
-  cudaMemcpyAsync(next_token_scores_cpu.get(), softmax_buffer_.get(), params_->BatchBeamSize() * params_->config.model.vocab_size * sizeof(float), cudaMemcpyDeviceToHost, GetStream());
-  CudaCheck() == cudaStreamSynchronize(GetStream());
+  CUDA_CHECK(cudaMemcpyAsync(next_token_scores_cpu.get(), softmax_buffer_.get(), params_->BatchBeamSize() * params_->config.model.vocab_size * sizeof(float), cudaMemcpyDeviceToHost, GetStream()));
+  CUDA_CHECK(cudaStreamSynchronize(GetStream()));
 
   auto beam_scores = beam_scorer_->GetNextScores();
 
@@ -140,7 +135,7 @@ void BeamSearch_Cuda::SelectTop() {
   } else
     assert(false);
 
-  CudaCheck() == cudaStreamSynchronize(GetStream());
+  CUDA_CHECK(cudaStreamSynchronize(GetStream()));
 
   size_t size = params_->BatchBeamSize() * 2;
   std::span<float> next_scores{topk_next_scores_.get(), size};
@@ -170,7 +165,7 @@ void GreedySearch_Cuda::SampleTopKTopP(int k, float p, float temperature) {
   cuda::Launch_CheckForEOSAndPad(next_tokens_.data(), static_cast<int>(next_tokens_.size()), eos_seen_.data(), eos_token_ids_.Span().data(), static_cast<int>(eos_token_ids_.Span().size()), params_->config.model.pad_token_id, done_cpu_.get(), GetStream());
 
   // Append tokens
-  cudaStreamSynchronize(GetStream());
+  CUDA_CHECK(cudaStreamSynchronize(GetStream()));
   if (!*done_cpu_) {
     cuda::Launch_AppendNextTokensToSequences(next_tokens_buffer_.Span(), sequences_.GetSequences().Span(), params_->BatchBeamSize(), sequences_.GetSequenceLength(), sequences_.max_length_, GetStream());
     sequences_.AfterAppendNextTokens(next_tokens_buffer_, params_->BatchBeamSize());
@@ -246,7 +241,7 @@ void BeamSearch_Cuda::AppendTokens(DeviceSpan<int32_t>& next_tokens) {
   cuda::Launch_ExpandInputSequences(next_tokens_gpu, sequences_.GetNextSequences().Span(), params_->search.batch_size, params_->search.num_beams, sequences_.max_length_, GetStream());
   cuda::Launch_ExpandInputSequences(next_tokens_gpu, sequences_.GetSequences().Span(), params_->search.batch_size, params_->search.num_beams, sequences_.max_length_, GetStream());
   sequences_.AfterAppendNextTokens(next_tokens, params_->search.batch_size);  // next_tokens is batch_size
-  cudaStreamSynchronize(GetStream());
+  CUDA_CHECK(cudaStreamSynchronize(GetStream()));
 }
 
 void GreedySearch_Cuda::RewindTo(size_t index) {
@@ -254,7 +249,7 @@ void GreedySearch_Cuda::RewindTo(size_t index) {
   if (index > 0)
     cuda::Launch_GetLastTokens(next_tokens_.data(), sequences_.GetSequences().Span().data(), static_cast<int>(params_->BatchBeamSize()), static_cast<int>(index), sequences_.max_length_, GetStream());
   else
-    cudaMemsetAsync(next_tokens_.data(), 0, params_->search.batch_size * sizeof(int32_t), GetStream());
+    CUDA_CHECK(cudaMemsetAsync(next_tokens_.data(), 0, params_->search.batch_size * sizeof(int32_t), GetStream()));
   sequences_.RewindTo(index);
 }
 
diff --git a/src/cuda/search_cuda.cu b/src/cuda/search_cuda.cu
index e72689291b..f5b52fe9bd 100644
--- a/src/cuda/search_cuda.cu
+++ b/src/cuda/search_cuda.cu
@@ -27,6 +27,7 @@ void Launch_ExpandInputSequences(const std::span<int32_t> input_sequences, std::
   const int total_elements = static_cast<int>(input_sequences.size());
   const int new_length = total_elements / batch_size;
   ExpandInputSequences<<<1, 1, 0, stream>>>(input_sequences.data(), sequences.data(), batch_size, beam_size, new_length, max_length);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void AppendNextTokensToSequences(const int32_t* next_tokens, int32_t* sequences, int batch_beam_size, int past_length, int new_length, int max_length) {
@@ -45,6 +46,7 @@ void Launch_AppendNextTokensToSequences(std::span<const int32_t> next_tokens, st
   const int gridSize = (total_elements + blockSize - 1) / blockSize;
   const int new_length = total_elements / batch_beam_size;
   AppendNextTokensToSequences<<<gridSize, blockSize, 0, stream>>>(next_tokens.data(), sequences.data(), batch_beam_size, past_length, new_length, max_length);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void GetLastTokens(int32_t* next_tokens, const int32_t* sequences, int batch_beam_size, int sequence_length, int max_length) {
@@ -59,6 +61,7 @@ void Launch_GetLastTokens(int32_t* next_tokens, const int32_t* sequences, int ba
   const int blockSize = std::min(batch_beam_size, 256);
   const int gridSize = (batch_beam_size + blockSize - 1) / blockSize;
   GetLastTokens<<<gridSize, blockSize, 0, stream>>>(next_tokens, sequences, batch_beam_size, sequence_length, max_length);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void ArgMax(cub::KeyValuePair<int, float>* argmaxen, int32_t* next_tokens, int batch_size) {
@@ -110,6 +113,7 @@ __global__ void CheckForEOSAndPad(int32_t* next_tokens, int next_tokens_count, b
 
 void Launch_CheckForEOSAndPad(int32_t* next_tokens, int next_tokens_count, bool* eos_seen, const int* eos_token_ids, int eos_token_count, int pad_token_id, bool* done_cpu, cudaStream_t stream) {
   CheckForEOSAndPad<<<1, 1, 0, stream>>>(next_tokens, next_tokens_count, eos_seen, eos_token_ids, eos_token_count, pad_token_id, done_cpu);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void AddProbsKernel(float* log_probs,
@@ -133,6 +137,7 @@ void LaunchAddProbsKernel(float* log_probs,
   constexpr int blockSize = 256;
   const int gridSize = (total_elements + blockSize - 1) / blockSize;
   AddProbsKernel<<<gridSize, blockSize, 0, stream>>>(log_probs, cum_log_probs, vocab_size, total_elements);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void SetScoreProcessor(float* next_token_scores, int batch_beam_size, int vocab_size, int token, float score) {
@@ -149,6 +154,7 @@ void LaunchSetScoreProcessor(float* next_token_scores, int batch_beam_size, int
   const int gridSize = (total_elements + blockSize - 1) / blockSize;
 
   SetScoreProcessor<<<gridSize, blockSize, 0, stream>>>(next_token_scores, batch_beam_size, vocab_size, token, score);
+  CUDA_CHECK_LAUNCH();
 }
 
 __global__ void RepetitionPenaltyProcessor(const int32_t* sequences, float* next_token_scores, int max_sequence_length, int vocab_size, int total_elements, int current_sequence_length, float repetition_penalty) {
@@ -179,6 +185,7 @@ void LaunchRepetitionPenaltyProcessor(const int32_t* sequences, float* next_toke
   const int gridSize = (total_elements + blockSize - 1) / blockSize;
 
   RepetitionPenaltyProcessor<<<gridSize, blockSize, 0, stream>>>(sequences, next_token_scores, max_sequence_length, vocab_size, total_elements, current_sequence_length, repetition_penalty);
+  CUDA_CHECK_LAUNCH();
 }
 
 }  // namespace cuda