@@ -467,6 +467,7 @@ struct llama_server_context
467467 bool all_slots_are_idle = false ;
468468 bool add_bos_token = true ;
469469 bool has_eos_token = true ;
470+ bool has_gpu = false ;
470471
471472 bool grammar_lazy = false ;
472473 std::vector<common_grammar_trigger> grammar_triggers;
@@ -512,7 +513,7 @@ struct llama_server_context
512513 multimodal = true ;
513514 LOG_INFO (" Multi Modal Mode Enabled" , {});
514515 clp_ctx = clip_init (params.mmproj .c_str (), clip_context_params {
515- /* use_gpu */ false ,
516+ /* use_gpu */ has_gpu ,
516517 /* verbosity=*/ 1 ,
517518 });
518519 if (clp_ctx == nullptr ) {
@@ -2317,7 +2318,7 @@ static std::string get_all_kv_cache_types() {
23172318}
23182319
23192320static void params_parse (const backend::ModelOptions* request,
2320- common_params & params) {
2321+ common_params & params, llama_server_context &llama ) {
23212322
23222323 // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
23232324
@@ -2355,6 +2356,20 @@ static void params_parse(const backend::ModelOptions* request,
23552356 add_rpc_devices (std::string (llama_grpc_servers));
23562357 }
23572358
2359+ // Decode options. Options are in the form optname:optval, or just optname for booleans.
2360+ for (int i = 0 ; i < request->options_size (); i++) {
2361+ const std::string &opt = request->options (i);
2362+ // Split on the first ':' with std::string::find — unlike strtok this does not
2363+ // mutate the buffer, is reentrant, and handles empty option strings safely
2364+ // (strtok returns NULL for "" which made the strcmp below undefined behavior).
2365+ const size_t sep = opt.find (':');
2366+ const std::string optname = opt.substr (0, sep);
2367+ // Boolean options carry no explicit value; default it to "true".
2368+ const std::string optval = (sep == std::string::npos) ? "true" : opt.substr (sep + 1);
2369+
2370+ if (optname == "gpu") {
2371+ llama.has_gpu = true ;
2372+ }
2373+ }
2374+
23582373 // TODO: Add yarn
23592374
23602375 if (!request->tensorsplit ().empty ()) {
@@ -2448,7 +2463,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
24482463 grpc::Status LoadModel (ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
24492464 // Implement LoadModel RPC
24502465 common_params params;
2451- params_parse (request, params);
2466+ params_parse (request, params, llama );
24522467
24532468 llama_backend_init ();
24542469 llama_numa_init (params.numa );
0 commit comments