This repository was archived by the owner on Mar 13, 2025. It is now read-only.

Commit f063f15

Increase input length, reduce batch size (#107)

Signed-off-by: Antoni Baum <[email protected]>

Parent: 8e4e965

8 files changed: +19 -8 lines

aviary/backend/llm/predictor.py (+1 -1)

@@ -76,7 +76,7 @@ def init_model(
     # will raise CUDA errors if use_kernel=True.
     batch_size = max_batch_size or 1
     prompt = [WARMUP_PROMPT] * (
-        int(llm_config.max_input_words / (len(WARMUP_PROMPT) + 1)) + 1
+        int(llm_config.max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1
     )
     prompt = " ".join(prompt)
     logger.info(
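
The fix above corrects a units bug in the warmup logic: len(WARMUP_PROMPT) counts characters, while llm_config.max_input_words is a word budget, so the repeated warmup prompt came out far shorter than the configured input length. A minimal sketch of the difference, using a stand-in prompt (the actual WARMUP_PROMPT text is defined elsewhere in aviary):

    # Stand-in prompt; the real WARMUP_PROMPT lives in the aviary source.
    WARMUP_PROMPT = "Write a short story."
    max_input_words = 800  # matches the value added to the model YAMLs below

    # Before the fix: len(WARMUP_PROMPT) is the character count (20), so the
    # prompt was repeated only 39 times, roughly 156 words.
    n_before = int(max_input_words / (len(WARMUP_PROMPT) + 1)) + 1

    # After the fix: len(WARMUP_PROMPT.split()) is the word count (4), so the
    # prompt is repeated 161 times, roughly 644 words, much closer to budget.
    n_after = int(max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1

    print(n_before, n_after)  # 39 161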

models/amazon--LightGPT.yaml (+3 -1)

@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: amazon/LightGPT
+  max_input_words: 800
   initialization:
     runtime_env:
       pip:
@@ -26,9 +27,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
       use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 26
+    max_batch_size: 18
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
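
Across the model YAMLs the pattern is the same: each model gains max_input_words: 800, the DeepSpeed-kernel models (use_kernel: true) additionally gain max_tokens: 1536, and max_batch_size drops, presumably because longer sequences hold more activation and KV-cache memory per request, so fewer requests fit on the GPU at once. A minimal sketch for reading the affected knobs back out of a config, assuming the key layout shown in these diffs:

    # Hedged sketch: file paths and key names are taken from the diffs in
    # this commit; the layout of intermediate keys is assumed from the hunks.
    import yaml

    def summarize(path: str) -> None:
        with open(path) as f:
            model = yaml.safe_load(f)["model_config"]
        print(
            path,
            "max_input_words =", model.get("max_input_words"),
            "| max_batch_size =", model["generation"]["max_batch_size"],
        )

    summarize("models/amazon--LightGPT.yaml")
    # models/amazon--LightGPT.yaml max_input_words = 800 | max_batch_size = 18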

models/databricks--dolly-v2-12b.yaml (+3 -1)

@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: databricks/dolly-v2-12b
+  max_input_words: 800
   model_description: "Databricks’ dolly-v2-12b, an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation, information extraction, open QA and summarization. dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction following behavior not characteristic of the foundation model on which it is based.\n\nDolly v2 is also available in these smaller models sizes:\n\ndolly-v2-7b, a 6.9 billion parameter based on pythia-6.9b\ndolly-v2-3b, a 2.8 billion parameter based on pythia-2.8b\nPlease refer to the dolly GitHub repo for tips on running inference for various GPU configurations."
   initialization:
     s3_mirror_config:
@@ -24,9 +25,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
       use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512

models/h2oai--h2ogpt-oasst1-512-12b.yaml (+3 -1)

@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: h2oai/h2ogpt-oasst1-512-12b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--h2oai--h2ogpt-oasst1-512-12b/main-safetensors/
@@ -24,9 +25,10 @@ model_config:
       trust_remote_code: true
       use_cache: true
       use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       num_beams: 1

models/mosaicml--mpt-7b-chat.yaml (+2 -1)

@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-chat
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-chat/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
       mode: max-autotune
     pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512

models/mosaicml--mpt-7b-instruct.yaml (+2 -1)

@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-instruct
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-instruct/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
      mode: max-autotune
     pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512

models/mosaicml--mpt-7b-storywriter.yaml (+2 -1)

@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-storywriter
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-storywriter/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
       mode: max-autotune
     pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512

models/stabilityai--stablelm-tuned-alpha-7b.yaml (+3 -1)

@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: stabilityai/stablelm-tuned-alpha-7b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--stabilityai--stablelm-tuned-alpha-7b/main-safetensors/
@@ -23,9 +24,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
       use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 14
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
