Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4857,6 +4857,23 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
// Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
// PR: https://github.com/ggml-org/llama.cpp/pull/19042
if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
#ifdef _WIN32
_putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
#else
setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
#endif

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#endif
#endif // _WIN32


GGML_LOG_WARN("\n");
GGML_LOG_WARN("================================================================================\n");
GGML_LOG_WARN(" CUDA_SCALE_LAUNCH_QUEUES=4x has been enabled\n");
GGML_LOG_WARN(" This environment variable improves performance with multiple GPUs\n");
GGML_LOG_WARN("================================================================================\n");
GGML_LOG_WARN("\n");
}

ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

Expand Down
Loading