 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,40 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--main-gpu" || arg == "-mg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
@@ -438,6 +473,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
    fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT, --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -483,7 +521,10 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx        = params.n_ctx;
+    lparams.n_batch      = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.main_gpu     = params.main_gpu;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
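For readers skimming the `--tensor-split` hunk, here is a minimal standalone sketch of the same split-and-fill logic. The constant `MAX_DEVICES`, the hard-coded `"3,1"` input, and the `main()` driver are illustrative stand-ins for `LLAMA_MAX_DEVICES` and the parsed command-line value; they are not part of this change.

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    constexpr size_t MAX_DEVICES = 4;      // stand-in for LLAMA_MAX_DEVICES
    const std::string arg_next   = "3,1";  // e.g. the value passed to --tensor-split / -ts

    // Split the argument on ',' and '/', as the diff above does.
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    const std::vector<std::string> split_arg(it, std::sregex_token_iterator{});

    // Devices without an explicit proportion get 0.0f and therefore receive no tensors.
    float tensor_split[MAX_DEVICES];
    for (size_t i = 0; i < MAX_DEVICES; ++i) {
        tensor_split[i] = i < split_arg.size() ? std::stof(split_arg[i]) : 0.0f;
    }

    for (size_t i = 0; i < MAX_DEVICES; ++i) {
        printf("device %zu: %.2f\n", i, tensor_split[i]);  // prints 3.00, 1.00, 0.00, 0.00
    }
    return 0;
}
```

On a cuBLAS build, the new flags would be combined along the lines of `./main -m <model> -ngl 40 -ts 3,1 -mg 0` (binary name and values here are only an example); without cuBLAS, both options merely print the warnings added above.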