
Commit 9ae401c

Update benchmark scripts (sgl-project#8)
1 parent 8cc6f3a commit 9ae401c

File tree

28 files changed, +183 -50 lines


benchmark/dspy/README.md

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ python3 bench_dspy_intro.py --backend sglang
 ```
 docker run --name tgi --rm -ti --gpus all --network host \
   -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
-  ghcr.io/huggingface/text-generation-inference:1.1.0 \
+  ghcr.io/huggingface/text-generation-inference:1.3.0 \
   --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
   --max-input-length 2048 --max-total-tokens 4096 \
   --port 24000
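
Once the TGI container above is up, it can be smoke-tested from Python before running the benchmark. A minimal sketch (not part of the repo), assuming TGI's standard `/generate` endpoint on the mapped port 24000 and an illustrative prompt:

```
# Minimal smoke test against the TGI container launched above.
# Assumes TGI's standard /generate endpoint on the mapped port 24000.
import requests

resp = requests.post(
    "http://localhost:24000/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {"max_new_tokens": 16, "temperature": 0.7},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```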

benchmark/hellaswag/bench_other.py

Lines changed: 3 additions & 1 deletion
@@ -57,6 +57,8 @@ def call_select(context, choices):
             out = model + context + select(choices, name="answer")
             return choices.index(out["answer"])
 
+        call_select("Hello,", ["world", "earth"])
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
@@ -135,6 +137,6 @@ async def batched_call(batch_size):
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
     parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_other_args_and_parse(parser)
     main(args)

benchmark/hellaswag/bench_sglang.py

Lines changed: 1 addition & 1 deletion
@@ -91,6 +91,6 @@ def few_shot_hellaswag(s, question, choices):
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
     parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_sglang_args_and_parse(parser)
     main(args)

benchmark/json_regex_decode/README.md renamed to benchmark/json_decode_regex/README.md

Lines changed: 5 additions & 6 deletions
@@ -17,14 +17,13 @@ outlines 0.0.22
 
 ### Benchmark sglang
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
 
-Run mixtral-8x7b
-(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
+Run Mixtral-8x7B
 
 ```
 python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
@@ -39,7 +38,7 @@ python3 bench_sglang.py --num-questions 10
 
 ### Benchmark vllm
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
@@ -54,8 +53,8 @@ python3 bench_other.py --backend vllm --num-questions 10
 
 ### Benchmark guidance
 
-Run llama-7b and benchmark
+Run Llama-7B and benchmark
 
 ```
 python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
-```
+```
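
For context, the task benchmarked in this directory is regex-constrained JSON decoding. Below is a minimal sketch of that pattern with SGLang's frontend; it is not the benchmark script itself, it assumes a server already running on port 30000 as launched above, and the prompt and regex are illustrative.

```
# Minimal sketch of regex-constrained JSON decoding with SGLang.
# Assumes a server started as above on port 30000; prompt and regex are illustrative.
import sglang as sgl

@sgl.function
def city_info(s, name):
    s += f"Give a JSON summary of {name}.\n"
    s += sgl.gen(
        "json_output",
        max_tokens=64,
        regex=r'\{"name": "[\w ]+", "population": [0-9]+\}',
    )

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = city_info.run(name="Paris", temperature=0)
print(state["json_output"])
```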

benchmark/json_regex_decode/bench_other.py renamed to benchmark/json_decode_regex/bench_other.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ def get_one_answer(i):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),

benchmark/json_regex_decode/bench_sglang.py renamed to benchmark/json_decode_regex/bench_sglang.py

Lines changed: 1 addition & 3 deletions
@@ -64,8 +64,6 @@ def main(args):
     # Run requests
     tic = time.time()
     states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
-    for state in states:
-        state.sync()
     latency = time.time() - tic
 
     # Compute accuracy
@@ -80,7 +78,7 @@ def main(args):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),

benchmark/latency_throughput/README.md

Lines changed: 1 addition & 14 deletions
@@ -3,19 +3,6 @@
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
 
-### Performance
-
-- Model: Llama-2-7b-chat-hf
-- `--num-prompts 2000 --request-rate 200`
-- On 4 A10 (24G) GPUs
-
-| Backend     | Throughput      | Latency  |
-| ----------- | --------------- | -------- |
-| srt         | 5.82 requests/s | 343.54 s |
-| vllm==0.2.6 | 3.93 requests/s | 509.08 s |
-| vllm==0.2.7 | 5.02 requests/s | 398.25 s |
-
-
 ### SGLang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
@@ -28,7 +15,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat
 
 ### vLLM
 ```
-python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16
+python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16 --port 21000
 ```
 
 ```
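
`bench_throughput.py` drives the chosen server with a stream of requests. A common way such scripts pace traffic at a target requests-per-second rate is to sleep for exponentially distributed gaps between sends (a Poisson arrival process). The sketch below is a generic illustration with placeholder names, not the actual code of `bench_throughput.py`.

```
# Generic sketch of pacing requests at a target rate (requests per second)
# using exponential inter-arrival times; send_request is a placeholder,
# not the real client used by bench_throughput.py.
import asyncio
import random

async def send_request(prompt: str) -> None:
    # Placeholder for an HTTP call to the serving backend (e.g. port 21000).
    await asyncio.sleep(0)

async def run_benchmark(prompts, request_rate: float) -> None:
    tasks = []
    for prompt in prompts:
        tasks.append(asyncio.create_task(send_request(prompt)))
        # Exponential gaps between sends approximate a Poisson request stream.
        await asyncio.sleep(random.expovariate(request_rate))
    await asyncio.gather(*tasks)

asyncio.run(run_benchmark(["hello"] * 4, request_rate=200.0))
```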

benchmark/llm_judge/articles.jsonl

Lines changed: 25 additions & 0 deletions
Large diffs are not rendered by default.

benchmark/mmlu/bench_other.py

Lines changed: 3 additions & 0 deletions
@@ -95,6 +95,9 @@ def call_generate(prompt, temperature, max_tokens):
                                        max_tokens=max_tokens, temperature=0)
             return out["answer"]
 
+        # warmup
+        call_generate("Hello,", temperature=1.0, max_tokens=8)
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
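
The warmup call added above (and the similar one in `benchmark/hellaswag/bench_other.py`) issues one untimed request so that one-time costs such as connection setup or model compilation are not charged to the measured run. A generic sketch of that pattern, with `call_generate` as a placeholder for the backend-specific call:

```
# Generic sketch of the warmup-before-timing pattern used in these scripts.
# call_generate here is a placeholder, not the benchmark's real backend call.
import time

def call_generate(prompt, temperature, max_tokens):
    return prompt  # placeholder: a real backend request goes here

# Warmup: absorb one-time setup costs outside the timed region.
call_generate("Hello,", temperature=1.0, max_tokens=8)

tic = time.time()
for question in ["q1", "q2", "q3"]:
    call_generate(question, temperature=0, max_tokens=64)
latency = time.time() - tic
print(f"latency: {latency:.3f} s")
```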
