@@ -372,8 +372,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
             params.simple_io = true;
-        } else if (arg == "--hot-plug") {
-            params.hot_plug = true;
+        } else if (arg == "-cb" || arg == "--cont-batching") {
+            params.cont_batching = true;
         } else if (arg == "--color") {
             params.use_color = true;
         } else if (arg == "--mlock") {
@@ -675,7 +675,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
-    printf("  --hot-plug            enable hot-plugging of new sequences for decoding (default: disabled)\n");
+    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     if (llama_mlock_supported()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
@@ -1270,7 +1270,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
     fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "hot_plug: %s # default: false\n", params.hot_plug ? "true" : "false");
+    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
 
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
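
Usage note: the renamed flag combines with the parallel-sequence options defined above. A hedged sketch of an invocation (the parallel example binary and the model path are illustrative assumptions; -np and -cb are the flags this diff defines):

    # decode 4 sequences in parallel with continuous batching enabled
    ./parallel -m models/7B/ggml-model.gguf -np 4 -cb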