See the supported models [here](https://vllm.readthedocs.io/en/latest/models/supported_models.html).

If you use an AWQ quantized model, pass `--quantization awq`:
```bash
python3 -m fastchat.serve.vllm_worker --model-path TheBloke/vicuna-7B-v1.5-AWQ --quantization awq
```
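Once the worker registers with the controller, you can sanity-check it with FastChat's test client. A minimal sketch, assuming the worker kept its default model name (derived from the last component of `--model-path`):

```bash
python3 -m fastchat.serve.test_message --model-name vicuna-7B-v1.5-AWQ
```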

## Add vllm_worker support for lora_modules

With vLLM's LoRA support, a single `vllm_worker` can serve one base model plus several LoRA adapters, each exposed under its own model name via `--lora-modules` and `--model-names`.

### Usage

1. Start the worker with LoRA enabled:

```bash
# vLLM uses multiprocessing; "spawn" avoids CUDA re-initialization errors under fork
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# --model-names must include the base model and every LoRA adapter name, so that
# requests addressed to m1/m2/m3 are routed to this worker
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m fastchat.serve.vllm_worker \
    --model-path /data/models/Qwen/Qwen2-72B-Instruct \
    --tokenizer /data/models/Qwen/Qwen2-72B-Instruct \
    --enable-lora \
    --lora-modules m1=/data/modules/lora/adapter/m1 m2=/data/modules/lora/adapter/m2 m3=/data/modules/lora/adapter/m3 \
    --model-names qwen2-72b-instruct,m1,m2,m3 \
    --controller http://localhost:21001 \
    --host 0.0.0.0 \
    --num-gpus 8 \
    --port 31034 \
    --limit-worker-concurrency 100 \
    --worker-address http://localhost:31034
```
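The curl examples below go through FastChat's OpenAI-compatible API server. If one is not already running, a typical invocation looks like this (the port is an assumption and corresponds to the `fastchat_address:port` placeholder used below; the controller is assumed to be at its default address):

```bash
python3 -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8000
```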

1. Post a chat completion request. The `model` field selects either a LoRA adapter or the base model:

- Example 1: query LoRA adapter `m1`

```bash
curl --location --request POST 'http://fastchat_address:port/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-xxx' \
--data-raw '{
    "model": "m1",
    "stream": false,
    "temperature": 0.7,
    "top_p": 0.1,
    "max_tokens": 4096,
    "messages": [
        {
            "role": "user",
            "content": "Hi?"
        }
    ]
}'
```

- Example 2: query the base model `qwen2-72b-instruct`

```bash
curl --location --request POST 'http://fastchat_address:port/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-xxx' \
--data-raw '{
    "model": "qwen2-72b-instruct",
    "stream": false,
    "temperature": 0.7,
    "top_p": 0.1,
    "max_tokens": 4096,
    "messages": [
        {
            "role": "user",
            "content": "Hi?"
        }
    ]
}'
```
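To confirm that the base model and all three adapters are exposed under the names given in `--model-names`, you can list the models served by the API server (same placeholder address and key as above):

```bash
curl --location --request GET 'http://fastchat_address:port/v1/models' \
--header 'Authorization: Bearer sk-xxx'
```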
