@@ -372,8 +372,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
             params.simple_io = true;
-        } else if (arg == "--hot-plug") {
-            params.hot_plug = true;
+        } else if (arg == "-cb" || arg == "--cont-batching") {
+            params.cont_batching = true;
         } else if (arg == "--color") {
             params.use_color = true;
         } else if (arg == "--mlock") {
@@ -675,7 +675,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
-    printf("  --hot-plug            enable hot-plugging of new sequences for decoding (default: disabled)\n");
+    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     if (llama_mlock_supported()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
@@ -1270,7 +1270,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
     fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "hot_plug: %s # default: false\n", params.hot_plug ? "true" : "false");
+    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
 
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
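
Usage note: the renamed flag combines with the parallel-sequence options defined above. A hedged sketch of an invocation (the parallel example binary and the model path are illustrative assumptions; -np and -cb are the flags this diff defines):

    # decode 4 sequences in parallel with continuous batching enabled
    ./parallel -m models/7B/ggml-model.gguf -np 4 -cb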