diff --git a/benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp b/benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp index f376678bc..e48dae4c8 100644 --- a/benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp +++ b/benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp @@ -633,7 +633,17 @@ xferBenchNixlWorker::cleanupBasicDescVram(xferBenchIOV &iov) { CHECK_CUDA_DRIVER_ERROR(cuMemAddressFree(iov.addr, iov.padded_size), "Failed to free reserved address"); } else { - CHECK_CUDA_ERROR(cudaFree((void *)iov.addr), "Failed to deallocate CUDA buffer"); + /* + * cudaFree() implicitly synchronizes the device: it blocks the host until work previously + * issued to any stream has finished, whether or not that work touches this buffer, so the + * application may hang here while kernels are still running in other streams. + * cudaFreeAsync() instead enqueues the free on the given stream (here stream 0), and the + * cudaStreamSynchronize(0) below waits for that stream before this function returns. + * NOTE(review): if stream 0 is the legacy default stream it may still order against other + * blocking streams - confirm that this change actually avoids the hang. + */ + CHECK_CUDA_ERROR(cudaFreeAsync((void *)iov.addr, 0), "Failed to deallocate CUDA buffer"); + CHECK_CUDA_ERROR(cudaStreamSynchronize(0), "Failed to synchronize stream 0"); } } #endif /* HAVE_CUDA */