diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 5af232cb6af6..d5fcdd5b2564 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -1,18 +1,50 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import itertools
+import json
+import pathlib
+from datetime import datetime
+
+from icecream import ic
+from tqdm import tqdm
 from transformers import AutoTokenizer
 
 from vllm import LLM, SamplingParams
 from vllm.benchmarks.datasets import add_dataset_parser, get_samples
 from vllm.inputs import TokensPrompt
-from vllm.v1.metrics.reader import Counter, Vector
+from vllm.v1.metrics.reader import Counter, Histogram, Vector
 
 try:
     from vllm.utils import FlexibleArgumentParser
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
+# Create a timestamped output directory for this run, plus the per-forward-pass
+# timing logs that the drafter and target write into it.
+outputs_dir = pathlib.Path("outputs/") / datetime.now().strftime("%Y%m%d_%H%M%S")
+outputs_dir.mkdir(parents=True, exist_ok=True)
+(outputs_dir / "drafter.csv").touch()
+(outputs_dir / "target.csv").touch()
+
+
+def read_stats(path):
+    # Each CSV row is "<forward_time>,<shape>".
+    forward_times, shapes = [], []
+    with open(path) as f:
+        for line in f:
+            parts = line.strip().split(",")
+            forward_times.append(float(parts[0]))
+            shapes.append(parts[1])
+    return forward_times, shapes
+
+
+def print_dict(stats, file=None, newlines=()):
+    # Print stats to stdout (inserting a blank line after each index listed in
+    # `newlines`), or append them to `file` as one JSON object per line.
+    if file is None:
+        for i, (k, v) in enumerate(stats.items()):
+            print(f"{k:<50}{v}")
+            if i in newlines:
+                print()
+    else:
+        with open(file, "a") as f:
+            for k, v in stats.items():
+                f.write(json.dumps({k: v}) + "\n")
 
 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
@@ -45,6 +77,31 @@ def get_custom_mm_prompts(num_prompts):
     return [[{"role": "user", "content": prompt}] for prompt in prompts[:num_prompts]]
 
 
+def multiturn_inference(llm, sampling_params, num_prompts):
+    from datasets import load_dataset
+
+    ds = load_dataset("philschmid/mt-bench", split="train")
+
+    outputs = []
+    total_samples = min(
+        sum(len(data["turns"]) for data in ds),
+        num_prompts if num_prompts is not None else float("inf"),
+    )
+    print(f"Running on {total_samples} samples.")
+
+    system_prompt = (
+        "You are a helpful, respectful and honest assistant. Always answer as "
+        "helpfully as possible, while being safe. Your answers should not include "
+        "any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
+        "content. Please ensure that your responses are socially unbiased and "
+        "positive in nature.\n\nIf a question does not make any sense, or is not "
+        "factually coherent, explain why instead of answering something not "
+        "correct. If you don't know the answer to a question, please don't share "
+        "false information."
+    )
+    for data in tqdm(ds, total=total_samples):
+        if len(outputs) >= total_samples:
+            break
+        messages = [{"role": "system", "content": system_prompt}]
+        # Replay the conversation turn by turn, appending the model's reply
+        # before asking the next question.
+        for qs in data["turns"]:
+            if len(outputs) >= total_samples:
+                break
+            messages.append({"role": "user", "content": qs})
+            output = llm.chat(messages, sampling_params=sampling_params, use_tqdm=False)[0]
+            outputs.append(output)
+            messages.append({"role": "assistant", "content": output.outputs[0].text})
+    return outputs
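+
+# NOTE: the loop above assumes the philschmid/mt-bench schema, where each
+# record carries a "turns" list holding the user messages of one conversation.
+# A sketch of one record (field values are illustrative, not taken from the
+# dataset):
+#
+#     {"category": "writing",
+#      "turns": ["Compose an engaging travel blog post about Hawaii.",
+#                "Rewrite your previous response as a poem."]}
+#
+# so a two-turn record contributes two chat completions to `outputs`.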
+
 
 def parse_args():
     parser = FlexibleArgumentParser()
@@ -56,6 +113,9 @@ def parse_args():
         choices=["ngram", "eagle", "eagle3", "mtp"],
     )
     parser.add_argument("--num-spec-tokens", type=int, default=2)
+    parser.add_argument("--spec-token-tree", type=str, default=None)
+    parser.add_argument("--spec-token-tree-depth", type=int, default=None)
+    parser.add_argument("--spec-token-tree-branching", type=int, default=None)
     parser.add_argument("--prompt-lookup-max", type=int, default=5)
     parser.add_argument("--prompt-lookup-min", type=int, default=2)
     parser.add_argument("--tp", type=int, default=1)
@@ -65,13 +125,16 @@ def parse_args():
     parser.add_argument("--top-p", type=float, default=1.0)
     parser.add_argument("--top-k", type=int, default=-1)
     parser.add_argument("--print-output", action="store_true")
+    parser.add_argument("--max-num-seqs", type=int, default=None)
     parser.add_argument("--output-len", type=int, default=256)
     parser.add_argument("--model-dir", type=str, default=None)
     parser.add_argument("--eagle-dir", type=str, default=None)
     parser.add_argument("--custom-mm-prompts", action="store_true")
+    parser.add_argument("--draft-vocab-frequency-path", type=str, default=None)
+    parser.add_argument("--draft-vocab-frequency-keep-threshold", type=str, default=None)
+    parser.add_argument("--compilation-config", type=str, default="")
     return parser.parse_args()
 
-
 def main():
     args = parse_args()
     args.endpoint_type = "openai-chat"
@@ -88,28 +151,65 @@ def main():
     tokenizer = AutoTokenizer.from_pretrained(model_dir)
     args.custom_skip_chat_template = True
 
-    if not args.custom_mm_prompts:
-        prompts = get_samples(args, tokenizer)
-        # add_special_tokens is False to avoid adding bos twice
-        # when using chat templates
-        prompt_ids = [
-            tokenizer.encode(prompt.prompt, add_special_tokens=False)
-            for prompt in prompts
-        ]
-    else:
-        prompts = get_custom_mm_prompts(args.num_prompts)
-    if args.method == "eagle" or args.method == "eagle3":
-        eagle_dir = args.eagle_dir
-        if args.method == "eagle" and eagle_dir is None:
-            eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
+    # if not args.custom_mm_prompts:
+    #     prompts = get_samples(args, tokenizer)
+    #     # add_special_tokens is False to avoid adding bos twice
+    #     # when using chat templates
+    #     prompt_ids = [
+    #         tokenizer.encode(prompt.prompt, add_special_tokens=False)
+    #         for prompt in prompts
+    #     ]
+    # else:
+    #     prompts = get_custom_mm_prompts(args.num_prompts)
+
+    # Manually specify the speculative token tree.
+    if args.spec_token_tree is not None:
+        assert args.spec_token_tree_depth is None and args.spec_token_tree_branching is None, (
+            "If using spec_token_tree, cannot also use spec token tree depth+branching"
+        )
+        spec_token_tree = ast.literal_eval(args.spec_token_tree)
+        assert args.num_spec_tokens == len(spec_token_tree), (
+            "expected `len(spec_token_tree) == num_spec_tokens` but got "
+            f"{len(spec_token_tree)=} and {args.num_spec_tokens=}"
+        )
+        spec_token_tree_str = str(sorted(spec_token_tree, key=lambda t: (len(t), t)))
+    # Construct a complete speculative token tree from the depth/branching args.
+    elif args.spec_token_tree_depth is not None or args.spec_token_tree_branching is not None:
+        assert args.spec_token_tree is None, (
+            "If using spec token tree depth+branching, cannot also use spec_token_tree"
+        )
+        if args.spec_token_tree_depth is None:
+            args.spec_token_tree_depth = 1
+        if args.spec_token_tree_branching is None:
+            args.spec_token_tree_branching = 1
+        depth, branching = args.spec_token_tree_depth, args.spec_token_tree_branching
+        spec_token_tree = []
+        for d in range(1, depth + 1):
+            for path in itertools.product(range(branching), repeat=d):
+                spec_token_tree.append(path)
+        if args.num_spec_tokens is None:
+            args.num_spec_tokens = len(spec_token_tree)
+        print(spec_token_tree)
+        assert args.num_spec_tokens == len(spec_token_tree), (
+            "expected `len(spec_token_tree) == num_spec_tokens` but got "
+            f"{len(spec_token_tree)=} and {args.num_spec_tokens=}"
+        )
+        spec_token_tree_str = str(sorted(spec_token_tree, key=lambda t: (len(t), t)))
+    else:
+        spec_token_tree_str = None
+    ic(args.num_spec_tokens, spec_token_tree_str)
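+    # Worked example of the construction above: --spec-token-tree-depth 2 and
+    # --spec-token-tree-branching 2 enumerate every path of length 1..depth:
+    #
+    #     [(0,), (1,), (0, 0), (0, 1), (1, 0), (1, 1)]
+    #
+    # i.e. branching + branching**2 = 6 draft positions. Note that
+    # --num-spec-tokens defaults to 2 (not None), so it must be set to 6
+    # explicitly here or the assert above fires.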
 
-    elif args.method == "eagle3" and eagle_dir is None:
-        eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
+    # Fall back to vanilla inference if num_spec_tokens == 0.
+    if args.num_spec_tokens == 0:
+        speculative_config = None
+        print("Ignore speculative decoding when `args.num_spec_tokens == 0`.")
+    elif args.method == "eagle":
+        eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" if args.eagle_dir is None else args.eagle_dir
+        speculative_config = {
+            "method": args.method,
+            "model": eagle_dir,
+            "num_speculative_tokens": args.num_spec_tokens,
+            "spec_token_tree": spec_token_tree_str,
+            "draft_vocab_frequency_path": args.draft_vocab_frequency_path,
+            "draft_vocab_frequency_keep_threshold": args.draft_vocab_frequency_keep_threshold,
+        }
+    elif args.method == "eagle3":
+        eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" if args.eagle_dir is None else args.eagle_dir
         speculative_config = {
             "method": args.method,
             "model": eagle_dir,
             "num_speculative_tokens": args.num_spec_tokens,
+            "spec_token_tree": spec_token_tree_str,
         }
     elif args.method == "ngram":
         speculative_config = {
@@ -121,6 +221,9 @@ def main():
     else:
         raise ValueError(f"unknown method: {args.method}")
 
+    # Save the run arguments alongside the stats.
+    print_dict({str(k): str(v) for k, v in vars(args).items()}, outputs_dir / "args.jsonl")
+
     llm = LLM(
         model=model_dir,
         trust_remote_code=True,
@@ -130,27 +233,51 @@ def main():
         gpu_memory_utilization=0.8,
         speculative_config=speculative_config,
         disable_log_stats=False,
-        max_model_len=16384,
+        max_model_len=8192,
+        seed=0,
+        max_num_seqs=args.max_num_seqs,
         limit_mm_per_prompt={"image": 5},
         disable_chunked_mm_input=True,
+        compilation_config=(
+            json.loads(args.compilation_config) if args.compilation_config else None
+        ),
     )
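+    # Example invocation matching outputs/20250919_175633/args.jsonl (a sketch;
+    # the dataset flags come from add_dataset_parser, the rest from parse_args):
+    #
+    #     python examples/offline_inference/spec_decode.py \
+    #         --method eagle --num-spec-tokens 1 --max-num-seqs 1 \
+    #         --dataset-name hf --dataset-path philschmid/mt-bench \
+    #         --num-prompts 100 --compilation-config '{"level": "0"}'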
+
+    # Print the effective batching limits.
+    scheduler_config = llm.llm_engine.vllm_config.scheduler_config
+    ic(
+        scheduler_config.max_num_seqs,
+        scheduler_config.max_num_batched_tokens,
+        scheduler_config.max_model_len,
+    )
+
     sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
 
-    if not args.custom_mm_prompts:
-        outputs = llm.generate(
-            [TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
-            sampling_params=sampling_params,
-        )
-    else:
-        outputs = llm.chat(prompts, sampling_params=sampling_params)
+    # if not args.custom_mm_prompts:
+    #     outputs = llm.generate(
+    #         [TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
+    #         sampling_params=sampling_params,
+    #     )
+    # else:
+    #     outputs = llm.chat(prompts, sampling_params=sampling_params)
+
+    # Perform multi-turn inference; the timing stats assume max-num-seqs=1.
+    assert args.max_num_seqs == 1
+    outputs = multiturn_inference(llm, sampling_params, args.num_prompts)
+
+    # Import Counter inside the function because vLLM has a separate Counter type.
+    def get_finish_reason_counts(outputs):
+        from collections import Counter
+
+        finish_reasons = [output.outputs[0].finish_reason for output in outputs]
+        return Counter(finish_reasons)
+
+    finish_reason_counts = get_finish_reason_counts(outputs)
+    print(f"Finish Reasons: {finish_reason_counts}")
 
     # print the generated text
     if args.print_output:
-        for output in outputs:
-            print("-" * 50)
-            print(f"prompt: {output.prompt}")
-            print(f"generated text: {output.outputs[0].text}")
-            print("-" * 50)
+        for i, output in enumerate(outputs):
+            prompt = tokenizer.decode(output.prompt_token_ids)
+            print("*" * 150)
+            print(f"Output {i}:")
+            print(f"---Finish reason---\n{output.outputs[0].finish_reason}")
+            print(f"---Prompt ({len(output.prompt_token_ids)} tokens)---\n{prompt}")
+            print(
+                f"---Generated Text ({len(output.outputs[0].token_ids)} tokens)---\n"
+                f"{output.outputs[0].text}"
+            )
+            print("*" * 150 + "\n")
 
     try:
         metrics = llm.get_metrics()
@@ -158,41 +285,73 @@
         print("Metrics are not supported in the V0 engine.")
         return
 
-    total_num_output_tokens = sum(
-        len(output.outputs[0].token_ids) for output in outputs
-    )
-    num_drafts = 0
-    num_draft_tokens = 0
-    num_accepted_tokens = 0
+    output_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
+    input_time = 0.0
+    output_time = 0.0
+    drafts = 0
+    draft_tokens = 0
+    accepted_tokens = 0
+    input_tokens = 0
+    requests = 0
     acceptance_counts = [0] * args.num_spec_tokens
+
     for metric in metrics:
         if metric.name == "vllm:spec_decode_num_drafts":
             assert isinstance(metric, Counter)
-            num_drafts += metric.value
+            drafts += metric.value
         elif metric.name == "vllm:spec_decode_num_draft_tokens":
            assert isinstance(metric, Counter)
-            num_draft_tokens += metric.value
+            draft_tokens += metric.value
        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
             assert isinstance(metric, Counter)
-            num_accepted_tokens += metric.value
+            accepted_tokens += metric.value
         elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
             assert isinstance(metric, Vector)
             for pos in range(len(metric.values)):
                 acceptance_counts[pos] += metric.values[pos]
+        elif metric.name == "vllm:prompt_tokens":
+            assert isinstance(metric, Counter)
+            input_tokens += metric.value
+        elif metric.name == "vllm:request_prefill_time_seconds":
+            assert isinstance(metric, Histogram)
+            input_time += metric.sum
+        elif metric.name == "vllm:request_decode_time_seconds":
+            assert isinstance(metric, Histogram)
+            output_time += metric.sum
+        elif metric.name == "vllm:request_success":
+            assert isinstance(metric, Counter)
+            requests += metric.value
+
+    # Calculate derived metrics.
+    tokens = input_tokens + output_tokens
+    total_time = input_time + output_time  # measured in seconds
+
+    input_throughput = input_tokens / input_time if input_time > 0 else 0
+    output_throughput = output_tokens / output_time if output_time > 0 else 0
+    total_throughput = tokens / total_time if total_time > 0 else 0
+
+    mean_acceptance_length = 1 + (accepted_tokens / drafts) if drafts > 0 else 1
+    draft_utilization_rate = accepted_tokens / draft_tokens * 100 if draft_tokens > 0 else 0
+
+    drafter_forward_times, _ = read_stats(outputs_dir / "drafter.csv")
+    target_forward_times, _ = read_stats(outputs_dir / "target.csv")
+
+    drafter_forward_time = sum(drafter_forward_times)
+    target_forward_time = sum(target_forward_times)
+    forward_ratio = drafter_forward_time / target_forward_time if target_forward_time > 0 else 0
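+
+    # Worked example from outputs/20250919_175633/stats.jsonl: drafts=12482 and
+    # accepted_tokens=8825 give
+    #     mean_acceptance_length = 1 + 8825 / 12482       ~ 1.71
+    #     draft_utilization_rate = 8825 / 12482 * 100     ~ 70.7%
+    # i.e. each verified draft commits ~0.71 extra tokens per target step.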
+
+    stats = {
+        "input_tokens": input_tokens,
+        "output_tokens": output_tokens,
+        "input_time": input_time,
+        "output_time": output_time,
+        "total_time": total_time,
+        "drafter_forward_time": drafter_forward_time,
+        "target_forward_time": target_forward_time,
+        "forward_ratio": forward_ratio,
+        "input_throughput": input_throughput,
+        "output_throughput": output_throughput,
+        "total_throughput": total_throughput,
+        "drafts": drafts,
+        "draft_tokens": draft_tokens,
+        "draft_utilization_rate": draft_utilization_rate,
+        "accepted_tokens": accepted_tokens,
+        "mean_acceptance_length": mean_acceptance_length,
+    }
 
-    print("-" * 50)
-    print(f"total_num_output_tokens: {total_num_output_tokens}")
-    print(f"num_drafts: {num_drafts}")
-    print(f"num_draft_tokens: {num_draft_tokens}")
-    print(f"num_accepted_tokens: {num_accepted_tokens}")
-    acceptance_length = 1 + (num_accepted_tokens / num_drafts) if num_drafts > 0 else 1
-    print(f"mean acceptance length: {acceptance_length:.2f}")
-    print("-" * 50)
-
-    # print acceptance at each token position
-    for i in range(len(acceptance_counts)):
-        acceptance_rate = acceptance_counts[i] / num_drafts if num_drafts > 0 else 0
-        print(f"acceptance at token {i}: {acceptance_rate:.2f}")
+    # print stats to stdout and save to file
+    print_dict(stats, newlines=[1, 4, 7, 10, 13])
+    print_dict(stats, file=outputs_dir / "stats.jsonl")
+
+    # print acceptance at each token position (acceptance_counts is still
+    # collected above, so keep the per-position report from the original script)
+    for i in range(len(acceptance_counts)):
+        acceptance_rate = acceptance_counts[i] / drafts if drafts > 0 else 0
+        print(f"acceptance at token {i}: {acceptance_rate:.2f}")
 
 
 if __name__ == "__main__":
diff --git a/outputs/20250919_175633/args.jsonl b/outputs/20250919_175633/args.jsonl
new file mode 100644
index 000000000000..59a101cd123e
--- /dev/null
+++ b/outputs/20250919_175633/args.jsonl
@@ -0,0 +1,56 @@
+{"seed": "0"}
+{"request_id_prefix": ""}
+{"num_prompts": "100"}
+{"dataset_name": "hf"}
+{"no_stream": "False"}
+{"dataset_path": "philschmid/mt-bench"}
+{"custom_output_len": "256"}
+{"custom_skip_chat_template": "True"}
+{"spec_bench_output_len": "256"}
+{"spec_bench_category": "None"}
+{"sonnet_input_len": "550"}
+{"sonnet_output_len": "150"}
+{"sonnet_prefix_len": "200"}
+{"sharegpt_output_len": "None"}
+{"blazedit_min_distance": "0.0"}
+{"blazedit_max_distance": "1.0"}
+{"random_input_len": "1024"}
+{"random_output_len": "128"}
+{"random_range_ratio": "0.0"}
+{"random_prefix_len": "0"}
+{"random_batch_size": "1"}
+{"random_mm_base_items_per_request": "1"}
+{"random_mm_num_mm_items_range_ratio": "0.0"}
+{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"}
+{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"}
+{"hf_subset": "None"}
+{"hf_split": "train"}
+{"hf_name": "None"}
+{"hf_output_len": "None"}
+{"prefix_repetition_prefix_len": "256"}
+{"prefix_repetition_suffix_len": "256"}
+{"prefix_repetition_num_prefixes": "10"}
+{"prefix_repetition_output_len": "128"}
+{"method": "eagle"}
+{"num_spec_tokens": "1"}
+{"spec_token_tree": "None"}
+{"spec_token_tree_depth": "None"}
+{"spec_token_tree_branching": "None"}
+{"prompt_lookup_max": "5"}
+{"prompt_lookup_min": "2"}
+{"tp": "1"}
+{"enforce_eager": "False"}
+{"enable_chunked_prefill": "False"}
+{"temp": "0"}
+{"top_p": "1.0"}
+{"top_k": "-1"}
+{"print_output": "False"}
+{"max_num_seqs": "1"}
+{"output_len": "256"}
+{"model_dir": "None"}
+{"eagle_dir": "None"}
+{"custom_mm_prompts": "False"}
+{"draft_vocab_frequency_path": "None"}
+{"draft_vocab_frequency_keep_threshold": "None"}
+{"compilation_config": "{\"level\": \"0\"}"}
+{"endpoint_type": "openai-chat"}
diff --git a/outputs/20250919_175633/stats.jsonl b/outputs/20250919_175633/stats.jsonl
new file mode 100644
index 000000000000..2384a37788b2
--- /dev/null
+++ b/outputs/20250919_175633/stats.jsonl
@@ -0,0 +1,19 @@
+{"input_tokens": 9468}
+{"output_tokens": 21363}
+{"input_time": 6.011072126999466}
+{"output_time": 452.5689067610002}
+{"total_time": 458.5799788879997}
+{"drafter_prefill_forward_time": 0.000760874999969019}
+{"target_prefill_forward_time": 0.02968344500004605}
+{"prefill_forward_ratio": 0.025632974877674696}
+{"drafter_decode_forward_time": 7.910397590991124}
+{"target_decode_forward_time": 172.45367566900495}
+{"decode_forward_ratio": 0.04586969550114876}
+{"input_throughput": 1575.093394317017}
+{"output_throughput": 47.203861513362234}
+{"total_throughput": 67.2314567128757}
+{"drafts": 12482}
+{"draft_tokens": 12482}
+{"draft_utilization_rate": 70.70181060727447}
+{"accepted_tokens": 8825}
+{"acceptance_length": 1.7070181060727447}
+{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_181518/args.jsonl b/outputs/20250919_181518/args.jsonl new file mode 100644 index 000000000000..ea22b9da4af9 --- /dev/null +++ b/outputs/20250919_181518/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "3"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_181951/args.jsonl b/outputs/20250919_181951/args.jsonl new file mode 100644 index 000000000000..4a98c1abda36 --- /dev/null +++ b/outputs/20250919_181951/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} 
+{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_182203/args.jsonl b/outputs/20250919_182203/args.jsonl new file mode 100644 index 000000000000..4a98c1abda36 --- /dev/null +++ b/outputs/20250919_182203/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} 
+{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_190500/args.jsonl b/outputs/20250919_190500/args.jsonl new file mode 100644 index 000000000000..59a101cd123e --- /dev/null +++ b/outputs/20250919_190500/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_191536/args.jsonl b/outputs/20250919_191536/args.jsonl new file mode 100644 index 000000000000..621df86ca381 --- /dev/null +++ b/outputs/20250919_191536/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} 
+{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "eturok/llama-3.1-8b-instruct-vocab-freq/vocab_freq.pt"} +{"draft_vocab_frequency_keep_threshold": "0.25"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_191922/args.jsonl b/outputs/20250919_191922/args.jsonl new file mode 100644 index 000000000000..97dfab0c2305 --- /dev/null +++ b/outputs/20250919_191922/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_192224/args.jsonl b/outputs/20250919_192224/args.jsonl new file mode 100644 index 000000000000..97dfab0c2305 --- /dev/null +++ b/outputs/20250919_192224/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} 
+{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_192342/args.jsonl b/outputs/20250919_192342/args.jsonl new file mode 100644 index 000000000000..97dfab0c2305 --- /dev/null +++ b/outputs/20250919_192342/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_192429/args.jsonl b/outputs/20250919_192429/args.jsonl new file mode 100644 index 000000000000..97dfab0c2305 --- /dev/null +++ b/outputs/20250919_192429/args.jsonl @@ -0,0 +1,56 @@ 
+{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_192524/args.jsonl b/outputs/20250919_192524/args.jsonl new file mode 100644 index 000000000000..97dfab0c2305 --- /dev/null +++ b/outputs/20250919_192524/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} 
+{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_193447/args.jsonl b/outputs/20250919_193447/args.jsonl new file mode 100644 index 000000000000..97dfab0c2305 --- /dev/null +++ b/outputs/20250919_193447/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_201658/args.jsonl b/outputs/20250919_201658/args.jsonl new file mode 100644 index 000000000000..9bfba6c9e90a --- /dev/null +++ b/outputs/20250919_201658/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": 
"None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_201850/args.jsonl b/outputs/20250919_201850/args.jsonl new file mode 100644 index 000000000000..9bfba6c9e90a --- /dev/null +++ b/outputs/20250919_201850/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_201850/stats.jsonl b/outputs/20250919_201850/stats.jsonl new file mode 100644 index 000000000000..41eea1ec81a5 --- /dev/null +++ b/outputs/20250919_201850/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.1720078110029135} +{"output_time": 10.268477026997061} +{"total_time": 10.440484837999975} +{"drafter_forward_time": 0.18030722998446436} +{"target_forward_time": 3.94027743398874} +{"forward_ratio": 0.045760034161335554} +{"input_throughput": 1540.627710188762} +{"output_throughput": 45.67376435346182} +{"total_throughput": 70.30324849747191} +{"drafts": 290} 
+{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207} diff --git a/outputs/20250919_203148/args.jsonl b/outputs/20250919_203148/args.jsonl new file mode 100644 index 000000000000..9bfba6c9e90a --- /dev/null +++ b/outputs/20250919_203148/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_203508/args.jsonl b/outputs/20250919_203508/args.jsonl new file mode 100644 index 000000000000..9bfba6c9e90a --- /dev/null +++ b/outputs/20250919_203508/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} 
+{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_203508/stats.jsonl b/outputs/20250919_203508/stats.jsonl new file mode 100644 index 000000000000..75b86755d2c1 --- /dev/null +++ b/outputs/20250919_203508/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.171807644999717} +{"output_time": 10.278211488001034} +{"total_time": 10.450019133000751} +{"drafter_forward_time": 0.18388141198738595} +{"target_forward_time": 4.033385653006917} +{"forward_ratio": 0.04558984134093428} +{"input_throughput": 1542.4226320105633} +{"output_throughput": 45.63050687831428} +{"total_throughput": 70.23910584833828} +{"drafts": 290} +{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207} diff --git a/outputs/20250919_203632/args.jsonl b/outputs/20250919_203632/args.jsonl new file mode 100644 index 000000000000..9bfba6c9e90a --- /dev/null +++ b/outputs/20250919_203632/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_203632/stats.jsonl 
b/outputs/20250919_203632/stats.jsonl new file mode 100644 index 000000000000..17c1357e5fc7 --- /dev/null +++ b/outputs/20250919_203632/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.1717779500013421} +{"output_time": 10.274111790000461} +{"total_time": 10.445889740001803} +{"drafter_forward_time": 0.18041177299710398} +{"target_forward_time": 3.9834033149745665} +{"forward_ratio": 0.04529086279536218} +{"input_throughput": 1542.6892683137128} +{"output_throughput": 45.64871490462719} +{"total_throughput": 70.26687225973662} +{"drafts": 290} +{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207}
diff --git a/outputs/20250919_205025/stats.jsonl b/outputs/20250919_205025/stats.jsonl new file mode 100644 index 000000000000..3866bf662076 --- /dev/null +++ b/outputs/20250919_205025/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.16604826000002504} +{"output_time": 10.265857733998928} +{"total_time": 10.431905993998953} +{"drafter_forward_time": 0.18097098000362166} +{"target_forward_time": 3.9934722519938077} +{"forward_ratio": 0.045316698998789554} +{"input_throughput": 1595.9215712345317} +{"output_throughput": 45.6854178337914} +{"total_throughput": 70.36106349330986} +{"drafts": 290} +{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207}
diff --git a/outputs/20250919_205113/stats.jsonl b/outputs/20250919_205113/stats.jsonl new file mode 100644 index 000000000000..0456cb46f854 --- /dev/null +++ b/outputs/20250919_205113/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.17269437200047832} +{"output_time": 10.257325591999688} +{"total_time": 10.430019964000167} +{"drafter_forward_time": 0.18194107699673623} +{"target_forward_time": 4.016665271987222} +{"forward_ratio": 0.04529654942014173} +{"input_throughput": 1534.5028151772428} +{"output_throughput": 45.723419403377584} +{"total_throughput": 70.37378667859166} +{"drafts": 290} +{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207}
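A note on reading these stats.jsonl files: each line holds a single {key: value} object, and the derived fields are redundant with the raw counters (for the 290-draft/177-accepted runs above, 1 + 177/290 = 1.6103... and 100 * 177/290 = 61.034...). A minimal sketch, assuming only the layout visible above; the path is one of the runs in this diff:

import json

def load_run(path):
    # Merge the one-{key: value}-per-line records into a single dict.
    run = {}
    with open(path) as f:
        for line in f:
            run.update(json.loads(line))
    return run

s = load_run("outputs/20250919_203632/stats.jsonl")
# acceptance_length = 1 + accepted drafts per speculation step
assert abs(s["acceptance_length"] - (1 + s["accepted_tokens"] / s["drafts"])) < 1e-9
# utilization = share of proposed draft tokens that were accepted, in percent
assert abs(s["draft_utilization_rate"] - 100 * s["accepted_tokens"] / s["draft_tokens"]) < 1e-9
# throughputs are plain tokens-per-second ratios
assert abs(s["output_throughput"] - s["output_tokens"] / s["output_time"]) < 1e-6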
+{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_205257/stats.jsonl b/outputs/20250919_205257/stats.jsonl new file mode 100644 index 000000000000..b76d4981c4bd --- /dev/null +++ b/outputs/20250919_205257/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.16813918700063368} +{"output_time": 10.266687685998477} +{"total_time": 10.43482687299911} +{"drafter_forward_time": 0.18296925398499297} +{"target_forward_time": 3.9790807840054185} +{"forward_ratio": 0.04598279449878689} +{"input_throughput": 1576.0751834669052} +{"output_throughput": 45.68172465590959} +{"total_throughput": 70.34136827888152} +{"drafts": 290} +{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207} diff --git a/outputs/20250919_205505/args.jsonl b/outputs/20250919_205505/args.jsonl new file mode 100644 index 000000000000..9bfba6c9e90a --- /dev/null +++ b/outputs/20250919_205505/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": 
"None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_205505/stats.jsonl b/outputs/20250919_205505/stats.jsonl new file mode 100644 index 000000000000..eb09cc67948c --- /dev/null +++ b/outputs/20250919_205505/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.17133993300012662} +{"output_time": 10.26988108399928} +{"total_time": 10.441221016999407} +{"drafter_forward_time": 0.18258203798905015} +{"target_forward_time": 3.971497681000983} +{"forward_ratio": 0.04597309444809543} +{"input_throughput": 1546.6330315408968} +{"output_throughput": 45.66752001936159} +{"total_throughput": 70.29829162748022} +{"drafts": 290} +{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207} diff --git a/outputs/20250919_205712/args.jsonl b/outputs/20250919_205712/args.jsonl new file mode 100644 index 000000000000..9bfba6c9e90a --- /dev/null +++ b/outputs/20250919_205712/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_205805/args.jsonl b/outputs/20250919_205805/args.jsonl new file mode 
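The args.jsonl files use the same one-object-per-line layout, with every value stringified ("None", "False", and numbers are all strings). Since most runs differ in only a field or two, a small config diff is the quickest way to see what changed between two output directories. A sketch under the same layout assumption as above:

import json

def load_run(path):
    # Same merge as in the earlier sketch: one {key: value} object per line.
    run = {}
    with open(path) as f:
        for line in f:
            run.update(json.loads(line))
    return run

def diff_args(a, b):
    # Keep only the fields whose (stringified) values differ between two runs.
    return {k: (a.get(k), b.get(k)) for k in a.keys() | b.keys() if a.get(k) != b.get(k)}

base = load_run("outputs/20250919_203632/args.jsonl")
scaled = load_run("outputs/20250919_210308/args.jsonl")
print(diff_args(base, scaled))  # {'num_prompts': ('2', '100')}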
+{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_205904/stats.jsonl b/outputs/20250919_205904/stats.jsonl new file mode 100644 index 000000000000..c1e29e6c36f1 --- /dev/null +++ b/outputs/20250919_205904/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 265} +{"output_tokens": 469} +{"input_time": 0.17122334499981662} +{"output_time": 10.280629928000053} +{"total_time": 10.45185327299987} +{"drafter_forward_time": 0.18158310000944766} +{"target_forward_time": 4.010394985001767} +{"forward_ratio": 0.045278108687184} +{"input_throughput": 1547.6861522608604} +{"output_throughput": 45.619772648623794} +{"total_throughput": 70.22677996218452} +{"drafts": 290} +{"draft_tokens": 290} +{"draft_utilization_rate": 61.03448275862069} +{"accepted_tokens": 177} +{"acceptance_length": 1.610344827586207} diff --git a/outputs/20250919_210308/args.jsonl b/outputs/20250919_210308/args.jsonl new file mode 100644 index 000000000000..97dfab0c2305 --- /dev/null +++ b/outputs/20250919_210308/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_210308/stats.jsonl b/outputs/20250919_210308/stats.jsonl new file mode 100644 index 000000000000..3d353c5b0680 --- /dev/null +++ b/outputs/20250919_210308/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 9468} +{"output_tokens": 21363} +{"input_time": 
6.0327107980047} +{"output_time": 449.70677139800137} +{"total_time": 455.73948219600607} +{"drafter_forward_time": 7.855270809979629} +{"target_forward_time": 173.12172989094506} +{"forward_ratio": 0.04537426246218725} +{"input_throughput": 1569.44370731836} +{"output_throughput": 47.50428803548797} +{"total_throughput": 67.65049157347332} +{"drafts": 12774} +{"draft_tokens": 12774} +{"draft_utilization_rate": 66.6431814623454} +{"accepted_tokens": 8513} +{"acceptance_length": 1.6664318146234538} diff --git a/outputs/20250919_212530/args.jsonl b/outputs/20250919_212530/args.jsonl new file mode 100644 index 000000000000..abbc70c2b57d --- /dev/null +++ b/outputs/20250919_212530/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "256"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_212530/stats.jsonl b/outputs/20250919_212530/stats.jsonl new file mode 100644 index 000000000000..9826c3164ffc --- /dev/null +++ b/outputs/20250919_212530/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 9468} +{"output_tokens": 21560} +{"input_time": 208.67358455207432} +{"output_time": 1594.6060836229972} +{"total_time": 1803.2796681750715} +{"drafter_forward_time": 0.12754699501601863} +{"target_forward_time": 2.417733884998597} +{"forward_ratio": 0.05275476999657166} +{"input_throughput": 45.37229769797369} +{"output_throughput": 13.520580550536327} +{"total_throughput": 17.20642701606041} +{"drafts": 12895} +{"draft_tokens": 12895} +{"draft_utilization_rate": 66.52966265994571} +{"accepted_tokens": 8579} +{"acceptance_length": 1.6652966265994573} diff --git a/outputs/20250919_212826/args.jsonl b/outputs/20250919_212826/args.jsonl new file mode 100644 index 000000000000..9a11e1221321 --- /dev/null +++ b/outputs/20250919_212826/args.jsonl @@ -0,0 +1,56 @@ 
+{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "256"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_212826/stats.jsonl b/outputs/20250919_212826/stats.jsonl new file mode 100644 index 000000000000..c33184a99a4b --- /dev/null +++ b/outputs/20250919_212826/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 9468} +{"output_tokens": 21552} +{"input_time": 214.9669461459398} +{"output_time": 1584.70794981998} +{"total_time": 1799.6748959659199} +{"drafter_forward_time": 0.15442322700255318} +{"target_forward_time": 2.4258221139989473} +{"forward_ratio": 0.06365810011847398} +{"input_throughput": 44.04398057351678} +{"output_throughput": 13.599982256950417} +{"total_throughput": 17.23644646571067} +{"drafts": 12599} +{"draft_tokens": 12599} +{"draft_utilization_rate": 70.52940709580126} +{"accepted_tokens": 8886} +{"acceptance_length": 1.7052940709580127} diff --git a/outputs/20250919_212947/args.jsonl b/outputs/20250919_212947/args.jsonl new file mode 100644 index 000000000000..edfb0ab01efa --- /dev/null +++ b/outputs/20250919_212947/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": 
"{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "0"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "256"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20250919_212947/stats.jsonl b/outputs/20250919_212947/stats.jsonl new file mode 100644 index 000000000000..1bbac6618e99 --- /dev/null +++ b/outputs/20250919_212947/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 9468} +{"output_tokens": 21520} +{"input_time": 170.07860098089077} +{"output_time": 1528.9547078100368} +{"total_time": 1699.0333087909275} +{"drafter_forward_time": 0} +{"target_forward_time": 4.153133364003224} +{"forward_ratio": 0.0} +{"input_throughput": 55.668378887146304} +{"output_throughput": 14.074975465312297} +{"total_throughput": 18.238606529763562} +{"drafts": 0} +{"draft_tokens": 0} +{"draft_utilization_rate": 0} +{"accepted_tokens": 0} +{"acceptance_length": 1} diff --git a/outputs/20251009_065435/args.jsonl b/outputs/20251009_065435/args.jsonl new file mode 100644 index 000000000000..9a9aa806bd47 --- /dev/null +++ b/outputs/20251009_065435/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2048"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": 
"None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_070051/args.jsonl b/outputs/20251009_070051/args.jsonl new file mode 100644 index 000000000000..9a9aa806bd47 --- /dev/null +++ b/outputs/20251009_070051/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2048"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_074541/args.jsonl b/outputs/20251009_074541/args.jsonl new file mode 100644 index 000000000000..c7aa57dfba91 --- /dev/null +++ b/outputs/20251009_074541/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} 
+{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_074727/args.jsonl b/outputs/20251009_074727/args.jsonl new file mode 100644 index 000000000000..c7aa57dfba91 --- /dev/null +++ b/outputs/20251009_074727/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_074727/stats.jsonl b/outputs/20251009_074727/stats.jsonl new file mode 100644 index 000000000000..5d3d3f52f1eb --- /dev/null +++ b/outputs/20251009_074727/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.9072829750029996} +{"output_time": 149.7024177399985} +{"total_time": 151.6097007150015} +{"drafter_forward_time": 9.537363458992786} +{"target_forward_time": 73.43282287100283} +{"forward_ratio": 0.1298787529351388} +{"input_throughput": 3653.36454596573} +{"output_throughput": 138.29435965394185} +{"total_throughput": 182.51470631167868} +{"drafts": 11207} 
+{"draft_tokens": 22414} +{"draft_utilization_rate": 42.15668778442045} +{"accepted_tokens": 9449} +{"mean_acceptance_length": 1.843133755688409} diff --git a/outputs/20251009_075410/args.jsonl b/outputs/20251009_075410/args.jsonl new file mode 100644 index 000000000000..0699b28d68a5 --- /dev/null +++ b/outputs/20251009_075410/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_075410/stats.jsonl b/outputs/20251009_075410/stats.jsonl new file mode 100644 index 000000000000..59c137dcd8dd --- /dev/null +++ b/outputs/20251009_075410/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.7776621990042258} +{"output_time": 148.36109535099877} +{"total_time": 150.138757550003} +{"drafter_forward_time": 9.101905309900758} +{"target_forward_time": 69.77205747404605} +{"forward_ratio": 0.1304520124447599} +{"input_throughput": 3919.754835256772} +{"output_throughput": 139.54466938263022} +{"total_throughput": 184.3028439261215} +{"drafts": 10897} +{"draft_tokens": 21794} +{"draft_utilization_rate": 44.860970909424616} +{"accepted_tokens": 9777} +{"mean_acceptance_length": 1.8972194181884923} diff --git a/outputs/20251009_080902/args.jsonl b/outputs/20251009_080902/args.jsonl new file mode 100644 index 000000000000..b73ac3417ce5 --- /dev/null +++ b/outputs/20251009_080902/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} 
+{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "39"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "3"} +{"spec_token_tree_branching": "3"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_080902/stats.jsonl b/outputs/20251009_080902/stats.jsonl new file mode 100644 index 000000000000..736a2c6d3bee --- /dev/null +++ b/outputs/20251009_080902/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 4.888458317999721} +{"output_time": 499.0485995950057} +{"total_time": 503.93705791300545} +{"drafter_forward_time": 145.2754475201973} +{"target_forward_time": 63.64234350096831} +{"forward_ratio": 2.2826853872530157} +{"input_throughput": 1425.3982639768512} +{"output_throughput": 41.48493757281588} +{"total_throughput": 54.909635172686265} +{"drafts": 9676} +{"draft_tokens": 377364} +{"draft_utilization_rate": 2.9226423294219903} +{"accepted_tokens": 11029} +{"mean_acceptance_length": 2.139830508474576} diff --git a/outputs/20251009_083617/args.jsonl b/outputs/20251009_083617/args.jsonl new file mode 100644 index 000000000000..b19e200bc1de --- /dev/null +++ b/outputs/20251009_083617/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} 
+{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "39"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "3"} +{"spec_token_tree_branching": "3"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_083617/stats.jsonl b/outputs/20251009_083617/stats.jsonl new file mode 100644 index 000000000000..8ff1dcd5f984 --- /dev/null +++ b/outputs/20251009_083617/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 4.977054404000228} +{"output_time": 448.0072490610046} +{"total_time": 452.9843034650048} +{"drafter_forward_time": 149.39230004907313} +{"target_forward_time": 66.36353935402258} +{"forward_ratio": 2.2511201407165125} +{"input_throughput": 1400.024881062096} +{"output_throughput": 46.21130582907353} +{"total_throughput": 61.08600185113857} +{"drafts": 10039} +{"draft_tokens": 391521} +{"draft_utilization_rate": 2.715818564010615} +{"accepted_tokens": 10633} +{"mean_acceptance_length": 2.05916923996414} diff --git a/outputs/20251009_090208/args.jsonl b/outputs/20251009_090208/args.jsonl new file mode 100644 index 000000000000..55f104d837b0 --- /dev/null +++ b/outputs/20251009_090208/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "0"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} 
+{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_090208/stats.jsonl b/outputs/20251009_090208/stats.jsonl new file mode 100644 index 000000000000..976cda7e7f01 --- /dev/null +++ b/outputs/20251009_090208/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.575881571001446} +{"output_time": 228.1597421870083} +{"total_time": 229.73562375800975} +{"drafter_forward_time": 0} +{"target_forward_time": 134.5509702269719} +{"forward_ratio": 0.0} +{"input_throughput": 4421.652063341254} +{"output_throughput": 90.73905765124438} +{"total_throughput": 120.44714505899631} +{"drafts": 0} +{"draft_tokens": 0} +{"draft_utilization_rate": 0} +{"accepted_tokens": 0} +{"mean_acceptance_length": 1} diff --git a/outputs/20251009_102405/args.jsonl b/outputs/20251009_102405/args.jsonl new file mode 100644 index 000000000000..55f104d837b0 --- /dev/null +++ b/outputs/20251009_102405/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "0"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_102405/stats.jsonl b/outputs/20251009_102405/stats.jsonl new file mode 100644 index 000000000000..6e8afdea26ec --- /dev/null +++ b/outputs/20251009_102405/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.6403260199931537} +{"output_time": 236.05312628999673} +{"total_time": 237.69345230998988} +{"drafter_forward_time": 0} +{"target_forward_time": 146.3788958291516} +{"forward_ratio": 0.0} +{"input_throughput": 4247.936029222461} +{"output_throughput": 87.70483291361236} +{"total_throughput": 116.41464975616003} +{"drafts": 0} +{"draft_tokens": 0} 
+{"draft_utilization_rate": 0} +{"accepted_tokens": 0} +{"mean_acceptance_length": 1} diff --git a/outputs/20251009_104015/args.jsonl b/outputs/20251009_104015/args.jsonl new file mode 100644 index 000000000000..9f61069f2b90 --- /dev/null +++ b/outputs/20251009_104015/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "True"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_104015/stats.jsonl b/outputs/20251009_104015/stats.jsonl new file mode 100644 index 000000000000..99e13410a916 --- /dev/null +++ b/outputs/20251009_104015/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 215} +{"output_tokens": 363} +{"input_time": 0.0724807079986931} +{"output_time": 3.178420182000991} +{"total_time": 3.2509008899996843} +{"drafter_forward_time": 0.20065858600355568} +{"target_forward_time": 1.5141977740004222} +{"forward_ratio": 0.13251808280858016} +{"input_throughput": 2966.306565381186} +{"output_throughput": 114.20768155690209} +{"total_throughput": 177.79686910112358} +{"drafts": 232} +{"draft_tokens": 464} +{"draft_utilization_rate": 28.01724137931034} +{"accepted_tokens": 130} +{"mean_acceptance_length": 1.5603448275862069} diff --git a/outputs/20251009_104312/args.jsonl b/outputs/20251009_104312/args.jsonl new file mode 100644 index 000000000000..9f61069f2b90 --- /dev/null +++ b/outputs/20251009_104312/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "2"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} 
+{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "True"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_104705/args.jsonl b/outputs/20251009_104705/args.jsonl new file mode 100644 index 000000000000..a491bb1ac119 --- /dev/null +++ b/outputs/20251009_104705/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_105952/args.jsonl 
b/outputs/20251009_105952/args.jsonl new file mode 100644 index 000000000000..b7159bd33275 --- /dev/null +++ b/outputs/20251009_105952/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "1000"} +{"dataset_name": "random"} +{"no_stream": "False"} +{"dataset_path": "None"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "None"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "None"} +{"output_len": "256"} +{"model_dir": "None"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": ""} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_110211/args.jsonl b/outputs/20251009_110211/args.jsonl new file mode 100644 index 000000000000..52ea41516bc3 --- /dev/null +++ b/outputs/20251009_110211/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": 
"False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_110211/stats.jsonl b/outputs/20251009_110211/stats.jsonl new file mode 100644 index 000000000000..42f6d9c06602 --- /dev/null +++ b/outputs/20251009_110211/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.7560279679983068} +{"output_time": 164.44199532401217} +{"total_time": 166.19802329201048} +{"drafter_forward_time": 6.090164461056702} +{"target_forward_time": 88.01806077895162} +{"forward_ratio": 0.06919221358843076} +{"input_throughput": 3968.0461399158753} +{"output_throughput": 125.89849666569269} +{"total_throughput": 166.49415830525228} +{"drafts": 13037} +{"draft_tokens": 13037} +{"draft_utilization_rate": 58.24192682365575} +{"accepted_tokens": 7593} +{"mean_acceptance_length": 1.5824192682365577} diff --git a/outputs/20251009_110725/args.jsonl b/outputs/20251009_110725/args.jsonl new file mode 100644 index 000000000000..a491bb1ac119 --- /dev/null +++ b/outputs/20251009_110725/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_110725/stats.jsonl b/outputs/20251009_110725/stats.jsonl new file mode 100644 index 000000000000..c1849f3d2215 --- /dev/null +++ b/outputs/20251009_110725/stats.jsonl @@ -0,0 +1,16 @@ 
+{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.6787991379951563} +{"output_time": 161.6672439630056} +{"total_time": 163.34604310100076} +{"drafter_forward_time": 5.7475049079603195} +{"target_forward_time": 83.76264570095918} +{"forward_ratio": 0.0686165636228764} +{"input_throughput": 4150.585881477921} +{"output_throughput": 128.05933652668364} +{"total_throughput": 169.40110378364267} +{"drafts": 12836} +{"draft_tokens": 12836} +{"draft_utilization_rate": 60.81333748831411} +{"accepted_tokens": 7806} +{"mean_acceptance_length": 1.608133374883141} diff --git a/outputs/20251009_111120/args.jsonl b/outputs/20251009_111120/args.jsonl new file mode 100644 index 000000000000..55f104d837b0 --- /dev/null +++ b/outputs/20251009_111120/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "0"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_111120/stats.jsonl b/outputs/20251009_111120/stats.jsonl new file mode 100644 index 000000000000..740930d0e545 --- /dev/null +++ b/outputs/20251009_111120/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.6416460930195171} +{"output_time": 231.47462342698418} +{"total_time": 233.1162695200037} +{"drafter_forward_time": 0} +{"target_forward_time": 143.93540399895392} +{"forward_ratio": 0.0} +{"input_throughput": 4244.520198128452} +{"output_throughput": 89.439609809023} +{"total_throughput": 118.70042385705538} +{"drafts": 0} +{"draft_tokens": 0} +{"draft_utilization_rate": 0} +{"accepted_tokens": 0} +{"mean_acceptance_length": 1} diff --git a/outputs/20251009_113116/args.jsonl b/outputs/20251009_113116/args.jsonl new file mode 100644 index 000000000000..ccda84ac7d69 --- /dev/null +++ b/outputs/20251009_113116/args.jsonl @@ -0,0 +1,56 @@ +{"seed": 
"0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_113158/args.jsonl b/outputs/20251009_113158/args.jsonl new file mode 100644 index 000000000000..ccda84ac7d69 --- /dev/null +++ b/outputs/20251009_113158/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} 
+{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_113158/stats.jsonl b/outputs/20251009_113158/stats.jsonl new file mode 100644 index 000000000000..516a80329fa4 --- /dev/null +++ b/outputs/20251009_113158/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31376} +{"input_time": 1.7378366129923961} +{"output_time": 245.42142118800984} +{"total_time": 247.15925780100224} +{"drafter_forward_time": 8.522425663930335} +{"target_forward_time": 123.95541888193839} +{"forward_ratio": 0.06875395800200988} +{"input_throughput": 4009.582919306631} +{"output_throughput": 127.84540097648528} +{"total_throughput": 155.138837772657} +{"drafts": 19457} +{"draft_tokens": 19457} +{"draft_utilization_rate": 60.93950763221463} +{"accepted_tokens": 11857} +{"mean_acceptance_length": 1.6093950763221463} diff --git a/outputs/20251009_113829/args.jsonl b/outputs/20251009_113829/args.jsonl new file mode 100644 index 000000000000..a491bb1ac119 --- /dev/null +++ b/outputs/20251009_113829/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "256"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_113829/stats.jsonl b/outputs/20251009_113829/stats.jsonl new file mode 100644 index 000000000000..32b560951740 --- /dev/null +++ b/outputs/20251009_113829/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 20703} +{"input_time": 1.716801205984666} +{"output_time": 162.13642130600783} 
+{"total_time": 163.8532225119925} +{"drafter_forward_time": 5.6751293028573855} +{"target_forward_time": 83.5976914840976} +{"forward_ratio": 0.06788619640216906} +{"input_throughput": 4058.711035214776} +{"output_throughput": 127.68876871240569} +{"total_throughput": 168.8767518623245} +{"drafts": 12836} +{"draft_tokens": 12836} +{"draft_utilization_rate": 60.81333748831411} +{"accepted_tokens": 7806} +{"mean_acceptance_length": 1.608133374883141} diff --git a/outputs/20251009_115654/args.jsonl b/outputs/20251009_115654/args.jsonl new file mode 100644 index 000000000000..afb9b2557844 --- /dev/null +++ b/outputs/20251009_115654/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "0"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "None"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_115654/stats.jsonl b/outputs/20251009_115654/stats.jsonl new file mode 100644 index 000000000000..d127dcfed890 --- /dev/null +++ b/outputs/20251009_115654/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31376} +{"input_time": 1.6232240820281731} +{"output_time": 348.01571073798186} +{"total_time": 349.63893482001004} +{"drafter_forward_time": 0} +{"target_forward_time": 207.5297044787003} +{"forward_ratio": 0.0} +{"input_throughput": 4292.691364764425} +{"output_throughput": 90.1568493372494} +{"total_throughput": 109.66742024808832} +{"drafts": 0} +{"draft_tokens": 0} +{"draft_utilization_rate": 0} +{"accepted_tokens": 0} +{"mean_acceptance_length": 1} diff --git a/outputs/20251009_120535/args.jsonl b/outputs/20251009_120535/args.jsonl new file mode 100644 index 000000000000..c8d77523abc5 --- /dev/null +++ b/outputs/20251009_120535/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": 
"philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "1"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_120535/stats.jsonl b/outputs/20251009_120535/stats.jsonl new file mode 100644 index 000000000000..e92594abecc5 --- /dev/null +++ b/outputs/20251009_120535/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31376} +{"input_time": 1.7491095579898683} +{"output_time": 248.10049085200444} +{"total_time": 249.8496004099943} +{"drafter_forward_time": 8.687068242441455} +{"target_forward_time": 127.30020808226618} +{"forward_ratio": 0.06824080159262226} +{"input_throughput": 3983.741308925122} +{"output_throughput": 126.46488482248203} +{"total_throughput": 153.46832629341355} +{"drafts": 19811} +{"draft_tokens": 19811} +{"draft_utilization_rate": 57.892080157488266} +{"accepted_tokens": 11469} +{"mean_acceptance_length": 1.5789208015748826} diff --git a/outputs/20251009_121123/args.jsonl b/outputs/20251009_121123/args.jsonl new file mode 100644 index 000000000000..62fa521f6530 --- /dev/null +++ b/outputs/20251009_121123/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} 
+{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "39"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "3"} +{"spec_token_tree_branching": "3"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_122058/args.jsonl b/outputs/20251009_122058/args.jsonl new file mode 100644 index 000000000000..ca17f9e1e3ec --- /dev/null +++ b/outputs/20251009_122058/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "6"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "2"} +{"spec_token_tree_branching": "2"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_122058/stats.jsonl b/outputs/20251009_122058/stats.jsonl new file mode 100644 index 000000000000..bc883cdb4584 --- /dev/null +++ b/outputs/20251009_122058/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31377} +{"input_time": 2.2473068760009483} +{"output_time": 262.3413088579946} +{"total_time": 264.58861573399554} +{"drafter_forward_time": 35.219244842021} 
+{"target_forward_time": 96.95689070823937} +{"forward_ratio": 0.3632464344179725} +{"input_throughput": 3100.6001336139107} +{"output_throughput": 119.60373353547755} +{"total_throughput": 144.92309086552004} +{"drafts": 14619} +{"draft_tokens": 87714} +{"draft_utilization_rate": 19.093873269945504} +{"accepted_tokens": 16748} +{"mean_acceptance_length": 2.1456323961967305} diff --git a/outputs/20251009_122640/args.jsonl b/outputs/20251009_122640/args.jsonl new file mode 100644 index 000000000000..1701e7dd844f --- /dev/null +++ b/outputs/20251009_122640/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "39"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "3"} +{"spec_token_tree_branching": "3"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_123536/args.jsonl b/outputs/20251009_123536/args.jsonl new file mode 100644 index 000000000000..ca17f9e1e3ec --- /dev/null +++ b/outputs/20251009_123536/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 
1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "6"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "2"} +{"spec_token_tree_branching": "2"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_123536/stats.jsonl b/outputs/20251009_123536/stats.jsonl new file mode 100644 index 000000000000..dd095197e323 --- /dev/null +++ b/outputs/20251009_123536/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31377} +{"input_time": 2.2872065480005404} +{"output_time": 265.46290667100766} +{"total_time": 267.7501132190082} +{"drafter_forward_time": 36.94001909087456} +{"target_forward_time": 101.00375129809254} +{"forward_ratio": 0.3657291795217924} +{"input_throughput": 3046.511040330562} +{"output_throughput": 118.19730445009407} +{"total_throughput": 143.21189088960506} +{"drafts": 14619} +{"draft_tokens": 87714} +{"draft_utilization_rate": 19.093873269945504} +{"accepted_tokens": 16748} +{"mean_acceptance_length": 2.1456323961967305} diff --git a/outputs/20251009_124304/args.jsonl b/outputs/20251009_124304/args.jsonl new file mode 100644 index 000000000000..909a552845bd --- /dev/null +++ b/outputs/20251009_124304/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "6"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "2"} +{"spec_token_tree_branching": "2"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": 
"NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_124304/stats.jsonl b/outputs/20251009_124304/stats.jsonl new file mode 100644 index 000000000000..b541678798f0 --- /dev/null +++ b/outputs/20251009_124304/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31377} +{"input_time": 2.2765729279999505} +{"output_time": 262.55621435795547} +{"total_time": 264.8327872859554} +{"drafter_forward_time": 36.611532192629966} +{"target_forward_time": 101.00442221272897} +{"forward_ratio": 0.36247454706013893} +{"input_throughput": 3060.7409559779107} +{"output_throughput": 119.50583640432228} +{"total_throughput": 144.78947411672507} +{"drafts": 15239} +{"draft_tokens": 91434} +{"draft_utilization_rate": 17.55911367762539} +{"accepted_tokens": 16055} +{"mean_acceptance_length": 2.053546820657523} diff --git a/outputs/20251009_124953/args.jsonl b/outputs/20251009_124953/args.jsonl new file mode 100644 index 000000000000..75570029e99a --- /dev/null +++ b/outputs/20251009_124953/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_124953/stats.jsonl b/outputs/20251009_124953/stats.jsonl new file mode 100644 index 000000000000..b3e95d4110ff --- /dev/null +++ b/outputs/20251009_124953/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31388} +{"input_time": 1.9103547270206036} +{"output_time": 227.44519797300018} +{"total_time": 
229.35555270002078} +{"drafter_forward_time": 14.150089318110986} +{"target_forward_time": 111.56500981812133} +{"forward_ratio": 0.12683268115316035} +{"input_throughput": 3647.490123924429} +{"output_throughput": 138.00247391341296} +{"total_throughput": 167.2337972569893} +{"drafts": 16457} +{"draft_tokens": 32914} +{"draft_utilization_rate": 45.23910797836786} +{"accepted_tokens": 14890} +{"mean_acceptance_length": 1.9047821595673573} diff --git a/outputs/20251009_125451/args.jsonl b/outputs/20251009_125451/args.jsonl new file mode 100644 index 000000000000..ed3c8ca1e992 --- /dev/null +++ b/outputs/20251009_125451/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "2"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "None"} +{"spec_token_tree_branching": "None"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_125451/stats.jsonl b/outputs/20251009_125451/stats.jsonl new file mode 100644 index 000000000000..7a9eb0ac6dc1 --- /dev/null +++ b/outputs/20251009_125451/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31388} +{"input_time": 1.9703581070134533} +{"output_time": 234.573027163995} +{"total_time": 236.54338527100845} +{"drafter_forward_time": 15.048283227861248} +{"target_forward_time": 118.03239825748824} +{"forward_ratio": 0.12749281934467982} +{"input_throughput": 3536.4129876683496} +{"output_throughput": 133.80907591756483} +{"total_throughput": 162.1520718326383} +{"drafts": 16948} +{"draft_tokens": 33896} +{"draft_utilization_rate": 42.34422940759971} +{"accepted_tokens": 14353} +{"mean_acceptance_length": 1.8468845881519944} diff --git a/outputs/20251009_130710/args.jsonl b/outputs/20251009_130710/args.jsonl new file mode 100644 index 000000000000..a248116dd683 --- /dev/null +++ b/outputs/20251009_130710/args.jsonl @@ -0,0 +1,56 @@ 
+{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "3"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "1"} +{"spec_token_tree_branching": "3"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_130710/stats.jsonl b/outputs/20251009_130710/stats.jsonl new file mode 100644 index 000000000000..2a00ae460eb3 --- /dev/null +++ b/outputs/20251009_130710/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31376} +{"input_time": 1.9523806880024495} +{"output_time": 230.609521795006} +{"total_time": 232.56190248300845} +{"drafter_forward_time": 20.154581754843093} +{"target_forward_time": 105.75494470667036} +{"forward_ratio": 0.19057815037155296} +{"input_throughput": 3568.9760930432117} +{"output_throughput": 136.0568278177639} +{"total_throughput": 164.8765321860983} +{"drafts": 15938} +{"draft_tokens": 47814} +{"draft_utilization_rate": 32.095202242021166} +{"accepted_tokens": 15346} +{"mean_acceptance_length": 1.962856067260635} diff --git a/outputs/20251009_131242/args.jsonl b/outputs/20251009_131242/args.jsonl new file mode 100644 index 000000000000..d547fe3798c4 --- /dev/null +++ b/outputs/20251009_131242/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} 
+{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "3"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "3"} +{"spec_token_tree_branching": "1"} +{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt"} +{"draft_vocab_frequency_keep_threshold": "1"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_131242/stats.jsonl b/outputs/20251009_131242/stats.jsonl new file mode 100644 index 000000000000..bb19383a2bef --- /dev/null +++ b/outputs/20251009_131242/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31376} +{"input_time": 2.0183868530002655} +{"output_time": 232.65246776396816} +{"total_time": 234.67085461696843} +{"drafter_forward_time": 19.970247890054452} +{"target_forward_time": 106.6540147569358} +{"forward_ratio": 0.18724328320473066} +{"input_throughput": 3452.2618841092317} +{"output_throughput": 134.86209839747647} +{"total_throughput": 163.39481126697805} +{"drafts": 15938} +{"draft_tokens": 47814} +{"draft_utilization_rate": 32.095202242021166} +{"accepted_tokens": 15346} +{"mean_acceptance_length": 1.962856067260635} diff --git a/outputs/20251009_131742/args.jsonl b/outputs/20251009_131742/args.jsonl new file mode 100644 index 000000000000..2091048aa606 --- /dev/null +++ b/outputs/20251009_131742/args.jsonl @@ -0,0 +1,56 @@ +{"seed": "0"} +{"request_id_prefix": ""} +{"num_prompts": "100"} +{"dataset_name": "hf"} +{"no_stream": "False"} +{"dataset_path": "philschmid/mt-bench"} +{"custom_output_len": "256"} +{"custom_skip_chat_template": "True"} +{"spec_bench_output_len": "256"} +{"spec_bench_category": "None"} +{"sonnet_input_len": "550"} +{"sonnet_output_len": "150"} +{"sonnet_prefix_len": "200"} +{"sharegpt_output_len": "None"} +{"blazedit_min_distance": "0.0"} +{"blazedit_max_distance": "1.0"} +{"random_input_len": "1024"} +{"random_output_len": "128"} +{"random_range_ratio": "0.0"} +{"random_prefix_len": "0"} +{"random_batch_size": "1"} +{"random_mm_base_items_per_request": "1"} +{"random_mm_num_mm_items_range_ratio": "0.0"} +{"random_mm_limit_mm_per_prompt": "{'image': 255, 'video': 0}"} +{"random_mm_bucket_config": "{(256, 256, 1): 0.5, (720, 1280, 1): 0.5, (720, 1280, 16): 0.0}"} +{"hf_subset": "None"} +{"hf_split": "train"} +{"hf_name": "None"} +{"hf_output_len": "None"} +{"prefix_repetition_prefix_len": "256"} +{"prefix_repetition_suffix_len": "256"} +{"prefix_repetition_num_prefixes": "10"} +{"prefix_repetition_output_len": "128"} +{"method": "eagle"} +{"num_spec_tokens": "3"} +{"spec_token_tree": "None"} +{"spec_token_tree_depth": "1"} +{"spec_token_tree_branching": "3"} 
+{"prompt_lookup_max": "5"} +{"prompt_lookup_min": "2"} +{"tp": "1"} +{"enforce_eager": "False"} +{"enable_chunked_prefill": "False"} +{"temp": "0"} +{"top_p": "1.0"} +{"top_k": "-1"} +{"print_output": "False"} +{"max_num_seqs": "1"} +{"output_len": "512"} +{"model_dir": "NousResearch/Meta-Llama-3-8B-Instruct"} +{"eagle_dir": "yuhuili/EAGLE-LLaMA3-Instruct-8B"} +{"custom_mm_prompts": "False"} +{"draft_vocab_frequency_path": "None"} +{"draft_vocab_frequency_keep_threshold": "None"} +{"compilation_config": "{\"level\": \"0\"}"} +{"endpoint_type": "openai-chat"} diff --git a/outputs/20251009_131742/stats.jsonl b/outputs/20251009_131742/stats.jsonl new file mode 100644 index 000000000000..241ed5e19fb9 --- /dev/null +++ b/outputs/20251009_131742/stats.jsonl @@ -0,0 +1,16 @@ +{"input_tokens": 6968} +{"output_tokens": 31376} +{"input_time": 1.9752304940011527} +{"output_time": 229.43122414098616} +{"total_time": 231.4064546349873} +{"drafter_forward_time": 19.401185439142864} +{"target_forward_time": 101.96877366827175} +{"forward_ratio": 0.19026594849771808} +{"input_throughput": 3527.6895639076406} +{"output_throughput": 136.75557944423187} +{"total_throughput": 165.69978594798715} +{"drafts": 15370} +{"draft_tokens": 46110} +{"draft_utilization_rate": 34.64324441552809} +{"accepted_tokens": 15974} +{"mean_acceptance_length": 2.0392973324658428} diff --git a/plot.py b/plot.py new file mode 100644 index 000000000000..d9fbe22b3c04 --- /dev/null +++ b/plot.py @@ -0,0 +1,127 @@ +import marimo + +__generated_with = "0.15.5" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import json + from pathlib import Path + + import pandas as pd + import plotly.express as px + from plotly.subplots import make_subplots + import plotly.graph_objects as go + return Path, go, json, make_subplots, pd + + +@app.cell +def _(): + all_runs_dir = '/home/ubuntu/vllm/final_outputs/spec_outputs' + return (all_runs_dir,) + + +@app.cell +def _(Path, json): + def parse_run(dir, keep=None): + dir = Path(dir) + data = {} + # read in args + with open(dir / "args.jsonl") as f: + for line in f: + data |= json.loads(line.strip()) + + # read in stats + with open(dir / "stats.jsonl") as f: + for line in f: + data |= json.loads(line.strip()) + + # parse stats + if keep: + data = {k: data[k] for k in keep if k in data} + return data + return (parse_run,) + + +@app.cell +def _(Path, parse_run, pd): + def parse_all_runs(all_runs_dir, keep=None): + all_runs_dir = Path(all_runs_dir) + run_dirs = [all_runs_dir / d.name for d in Path(all_runs_dir).iterdir() if d.is_dir()] + data = [parse_run(run_dir, keep) for run_dir in run_dirs] + return pd.DataFrame(data) + return (parse_all_runs,) + + +@app.function +def fix_dataframe(df): + + # convert to ints + df['num_spec_tokens'] = df['num_spec_tokens'].astype(int) + + # sort by num_spec_tokens + df = df.sort_values('num_spec_tokens') + + # add method column + def get_method(row): + if row['num_spec_tokens'] == 0: + return 'vanilla' + elif row['draft_vocab_frequency_keep_threshold'] == 'None': + return 'fr-spec' + else: + return 'eagle' + df['method'] = df.apply(get_method, axis=1) + + return df + + +@app.cell +def _(all_runs_dir, parse_all_runs): + keep = ['num_spec_tokens', 'spec_token_tree', 'forward_ratio', 'total_time_sec', 'acceptance_length', 'output_speed', 'draft_vocab_frequency_path', 'draft_vocab_frequency_keep_threshold'] + df_raw = parse_all_runs(all_runs_dir, keep=keep) + df_raw + return (df_raw,) + + +@app.cell +def _(df_raw): + df = fix_dataframe(df_raw) + df + return 
+
+
+@app.cell
+def _(all_runs_dir, parse_all_runs):
+    keep = ['num_spec_tokens', 'spec_token_tree', 'forward_ratio',
+            'total_time', 'mean_acceptance_length', 'output_throughput',
+            'draft_vocab_frequency_path',
+            'draft_vocab_frequency_keep_threshold']
+    df_raw = parse_all_runs(all_runs_dir, keep=keep)
+    df_raw
+    return (df_raw,)
+
+
+@app.cell
+def _(df_raw):
+    df = fix_dataframe(df_raw)
+    df
+    return (df,)
+
+
+@app.cell
+def _(df, go, make_subplots):
+    metrics = ['mean_acceptance_length', 'output_throughput',
+               'forward_ratio', 'total_time']
+    titles = ['Mean Acceptance Length', 'Decoding Throughput (toks/sec)',
+              'Forward Pass Ratio (D:T)', 'Total Time (sec)']
+    colors = {'vanilla': 'blue', 'eagle': 'red', 'fr-spec': 'green'}
+
+    fig = make_subplots(rows=2, cols=2, subplot_titles=titles)
+
+    for i, (metric, title) in enumerate(zip(metrics, titles)):
+        row, col = (i // 2) + 1, (i % 2) + 1
+
+        for method in ['vanilla', 'eagle', 'fr-spec']:
+            data = df[df['method'] == method].sort_values('num_spec_tokens')
+            fig.add_trace(
+                go.Scatter(x=data['num_spec_tokens'], y=data[metric],
+                           mode='lines+markers', name=method,
+                           line=dict(color=colors[method]),
+                           showlegend=(i == 0)),
+                row=row, col=col
+            )
+
+    fig.update_xaxes(title_text="Num Spec Tokens")
+    fig.update_layout(height=600, title="Speculative Decoding Metrics")
+    fig.show()
+    return
+
+
+@app.cell
+def _():
+    return
+
+
+if __name__ == "__main__":
+    app.run()
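The `forward_ratio` field in the stats files above is the drafter-to-target
forward-time ratio; the committed runs are consistent with
`forward_ratio == drafter_forward_time / target_forward_time`. A quick spot
check in Python, with the values copied from outputs/20251009_131742/stats.jsonl:

    # forward_ratio is drafter forward time divided by target forward time
    assert abs(19.401185439142864 / 101.96877366827175
               - 0.19026594849771808) < 1e-9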
diff --git a/scripts/spec.sh b/scripts/spec.sh
new file mode 100644
index 000000000000..2a12b70dea43
--- /dev/null
+++ b/scripts/spec.sh
@@ -0,0 +1,92 @@
+# params
+# model = "meta-llama/Llama-3.1-8B-Instruct"
+# data = mt-bench
+# num-prompts = 100
+# max-num-seqs = 1
+# compilation = False
+
+#***** vanilla *****
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 0
+
+#***** eagle *****
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 1
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 2
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 4
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 6
+
+# fr-spec
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 1 \
+    --draft-vocab-frequency-path 'eturok/llama-3.1-8b-instruct-vocab-freq/vocab_freq.pt' \
+    --draft-vocab-frequency-keep-threshold 0.25
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 2 \
+    --draft-vocab-frequency-path 'eturok/llama-3.1-8b-instruct-vocab-freq/vocab_freq.pt' \
+    --draft-vocab-frequency-keep-threshold 0.25
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 4 \
+    --draft-vocab-frequency-path 'eturok/llama-3.1-8b-instruct-vocab-freq/vocab_freq.pt' \
+    --draft-vocab-frequency-keep-threshold 0.25
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 6 \
+    --draft-vocab-frequency-path 'eturok/llama-3.1-8b-instruct-vocab-freq/vocab_freq.pt' \
+    --draft-vocab-frequency-keep-threshold 0.25
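A complete speculative token tree built from --spec-token-tree-depth d and
--spec-token-tree-branching b drafts b + b^2 + ... + b^d tokens per step, which
is why the tree runs recorded above pair depth 2 / branching 2 with
num_spec_tokens 6 and depth 3 / branching 3 with num_spec_tokens 39. A small
illustrative helper (not part of the PR) makes the relationship explicit:

    def complete_tree_size(depth: int, branching: int) -> int:
        # nodes in a complete tree: branching + branching^2 + ... + branching^depth
        return sum(branching ** level for level in range(1, depth + 1))

    assert complete_tree_size(2, 2) == 6
    assert complete_tree_size(3, 3) == 39
    assert complete_tree_size(1, 3) == complete_tree_size(3, 1) == 3

scripts/spec_tree.sh below sweeps these depth/branching configurations.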
diff --git a/scripts/spec_tree.sh b/scripts/spec_tree.sh
new file mode 100644
index 000000000000..e7ac73c746c1
--- /dev/null
+++ b/scripts/spec_tree.sh
@@ -0,0 +1,92 @@
+# params
+# model = "meta-llama/Llama-3.1-8B-Instruct"
+# data = mt-bench
+# num-prompts = 100
+# max-num-seqs = 1
+# compilation = False
+
+#***** vanilla *****
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 0
+
+#***** eagle, complete token trees *****
+# a complete tree with branching b and depth d drafts b + b^2 + ... + b^d
+# tokens per step, so --num-spec-tokens must match the tree size
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 3 \
+    --spec-token-tree-depth 1 \
+    --spec-token-tree-branching 3
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 3 \
+    --spec-token-tree-depth 3 \
+    --spec-token-tree-branching 1
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 6 \
+    --spec-token-tree-depth 2 \
+    --spec-token-tree-branching 2
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 39 \
+    --spec-token-tree-depth 3 \
+    --spec-token-tree-branching 3
+
+#***** fr-spec, complete token trees *****
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 6 \
+    --spec-token-tree-depth 2 \
+    --spec-token-tree-branching 2 \
+    --draft-vocab-frequency-path 'thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt' \
+    --draft-vocab-frequency-keep-threshold 1
+
+VLLM_USE_V1=1 python3 examples/offline_inference/spec_decode.py \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 100 \
+    --max-num-seqs 1 \
+    --compilation-config '{"level": "0"}' \
+    --num-spec-tokens 39 \
+    --spec-token-tree-depth 3 \
+    --spec-token-tree-branching 3 \
+    --draft-vocab-frequency-path 'thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt' \
+    --draft-vocab-frequency-keep-threshold 1
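For context on the test changes that follow: FR-Spec-style vocabulary pruning
restricts the drafter's lm_head to the most frequent token ids (e.g. keeping
32768 of the 131072 mocked rows gives the 0.25 ratio asserted below), while
the target model keeps its full vocabulary. A minimal sketch of the idea,
assuming the frequency file holds one count per token id; the names here are
illustrative, not the PR's actual API:

    import torch

    def prune_lm_head(lm_head_weight: torch.Tensor,
                      token_freq: torch.Tensor,
                      keep: int) -> tuple[torch.Tensor, torch.Tensor]:
        # keep the `keep` most frequent token ids, in ascending id order so
        # pruned draft logits can be mapped back to full-vocabulary token ids
        kept_ids = torch.topk(token_freq, k=keep).indices.sort().values
        return lm_head_weight[kept_ids], kept_ids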
"NousResearch/Meta-Llama-3-8B-Instruct" # "meta-llama/Llama-3.1-8B-Instruct" eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" - +vocab_freq_dir = "eturok/llama-3.1-8b-instruct-vocab-freq/vocab_freq.pt" +draft_vocab_frequency_keep_threshold = 0.5 def _create_proposer( method: str, num_speculative_tokens: int, speculative_token_tree: Optional[list[tuple[int]]] = None, + prune_vocab: bool = False, ) -> EagleProposer: model_config = ModelConfig(model=model_dir, runner="generate", @@ -41,6 +43,10 @@ def _create_proposer( assert num_speculative_tokens == len(speculative_token_tree) spec_token_tree_str = str(speculative_token_tree) + draft_vocab_frequency_path = None + if prune_vocab: + draft_vocab_frequency_path = vocab_freq_dir + speculative_config = SpeculativeConfig( target_model_config=model_config, target_parallel_config=ParallelConfig(), @@ -48,6 +54,8 @@ def _create_proposer( method=method, num_speculative_tokens=num_speculative_tokens, speculative_token_tree=spec_token_tree_str, + draft_vocab_frequency_path=draft_vocab_frequency_path, + draft_vocab_frequency_keep_threshold=draft_vocab_frequency_keep_threshold, ) vllm_config = VllmConfig( @@ -137,13 +145,19 @@ def test_prepare_inputs(): get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False]) +@pytest.mark.parametrize("prune_vocab", [True, False]) @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group') @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config') @mock.patch('vllm.v1.spec_decode.eagle.get_model') -def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, - attn_backend, pp_size, use_distinct_embed_tokens, +@mock.patch('copy.deepcopy') +def test_load_model(mock_deepcopy, mock_get_model, mock_get_layers, mock_get_pp_group, method, + attn_backend, pp_size, use_distinct_embed_tokens, prune_vocab, monkeypatch): + # Skip if prune_vocab=True and method != "eagle" + if prune_vocab and method != "eagle": + pytest.skip("prune_vocab only applies to eagle method") + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" @@ -163,6 +177,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, else: mock_model.model.embed_tokens.weight.shape = (131072, 4096) + mock_model.lm_head.data.shape = (131072, 4096) mock_get_model.return_value = mock_model # Setup mocks for attention layers @@ -197,20 +212,42 @@ class _TargetModelStub(LlamaForCausalLM): assert not isinstance(target_model, SupportsMultiModal) if method == "eagle": + # Setup the lm_head with data, device, and shape target_model.lm_head = mock.MagicMock() + device = torch.device(current_platform.device_type) + target_model.lm_head.weight.device = device + target_model.lm_head.weight.shape = (131072, 4096) + + # Create mock copy.deepcopy + if prune_vocab: + def my_deepcopy(obj, memo=None): + if hasattr(obj, 'data') and hasattr(obj.data, 'device'): + return mock.MagicMock(data=mock.MagicMock(device=obj.data.device, shape=obj.data.shape)) + return obj + mock_deepcopy.side_effect = my_deepcopy # Create proposer using the helper function - proposer = _create_proposer(method, num_speculative_tokens=8) + proposer = _create_proposer(method, num_speculative_tokens=8, prune_vocab=prune_vocab) # Call the method under test proposer.load_model(target_model) + # # Manually set the pruned vocab size + # if method == "eagle" and prune_vocab: + # 
proposer.model.lm_head.weight.data.shape = (32768, 4096)
+
     # Verify common interactions
     mock_get_model.assert_called_once()
 
-    # Verify that EAGLE models gain the lm head from the target model
     if method == "eagle":
-        assert proposer.model.lm_head == target_model.lm_head
+        if prune_vocab:
+            # Verify that the vocab of EAGLE models is pruned to the
+            # correct ratio
+            pruned_vocab_size = proposer.model.lm_head.weight.data.shape[0]
+            original_vocab_size = target_model.lm_head.weight.data.shape[0]
+            assert pruned_vocab_size / original_vocab_size == 0.25, \
+                f"{pruned_vocab_size / original_vocab_size=}"
+        else:
+            # Verify that EAGLE models have the same lm head as the
+            # target model
+            assert proposer.model.lm_head == target_model.lm_head
 
     # Verify that the embed tokens are set correctly
     # If pp_size is > 1, the embed tokens should be distinct
@@ -228,7 +265,8 @@ class _TargetModelStub(LlamaForCausalLM):
 @pytest.mark.parametrize("attn_backend",
                          get_attn_backend_list_based_on_platform())
 @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
-def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
+@pytest.mark.parametrize("prune_vocab", [True, False])
+def test_propose(method, attn_backend, num_speculative_tokens, prune_vocab,
+                 monkeypatch):
 
     monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
 
@@ -256,13 +294,16 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
     seq_lens = [seq_len_1, seq_len_2]
 
     # Create proposer first so we can use its actual hidden_size
-    proposer = _create_proposer("eagle", num_speculative_tokens)
+    proposer = _create_proposer(method, num_speculative_tokens,
+                                prune_vocab=prune_vocab)
     # Get the hidden_size from the proposer to ensure consistency
     hidden_size = proposer.hidden_size
 
     # Helper to create deterministic logits that will produce specific tokens
-    def create_deterministic_logits(token_ids):
+    def create_deterministic_logits(token_ids: list[int], vocab_size: int):
         logits = torch.full((batch_size, vocab_size), -100.0, device=device)
+        # simulate pruning the vocabulary of the draft model
+        if prune_vocab:
+            token_ids = [
+                pruned_token_ids.index(token_id) for token_id in token_ids
+            ]
         for i, token_id in enumerate(token_ids):
             logits[i, token_id] = 100.0
         return logits
@@ -272,9 +313,21 @@ def create_deterministic_logits(token_ids):
     # Sequence 2: 60, 61, 62, ...
     base_token_ids = [42, 60]
 
+    # prune the vocab of the draft model
+    if prune_vocab:
+        # make sure our pruned vocab is large enough to cover all base tokens
+        # and the `num_speculative_tokens` we will generate
+        pruned_token_ids = [
+            i for base in base_token_ids
+            for i in range(base, base + num_speculative_tokens + 1)
+        ]
+        pruned_vocab_size = len(pruned_token_ids)
+
+    # Set up the mock model with a custom class so that
+    # isinstance() checks match the expected type.
+    if method == "eagle3":
+        model_mock = mock.create_autospec(Eagle3LlamaForCausalLM,
+                                          instance=True)
+        model_mock.combine_hidden_states.side_effect = lambda x: x
+    else:
+        model_mock = mock.MagicMock()
+
-    # Skip loading the model and replace it with a mock directly
-    # Create the mock model with deterministic outputs
-    model_mock = mock.MagicMock()
 
     # Setup for model forward calls
     forward_returns = []
@@ -298,10 +351,11 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
 
     # Setup for compute_logits calls
     logits_returns = []
+    logit_vocab_size = pruned_vocab_size if prune_vocab else vocab_size
     for i in range(num_speculative_tokens):
         # For each call, increment the base token IDs
         current_tokens = [base_id + i for base_id in base_token_ids]
-        logits_returns.append(create_deterministic_logits(current_tokens))
+        logits_returns.append(
+            create_deterministic_logits(current_tokens, logit_vocab_size))
 
     if num_speculative_tokens == 1:
         model_mock.compute_logits.return_value = logits_returns[0]
@@ -314,6 +368,10 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
     # Assign draft attn_layer_names since load_model is not invoked
     proposer.attn_layer_names = ["layer.0"]
 
+    # Assign the pruned vocab (original token ids) to the proposer; this is
+    # the attribute EagleProposer reads in propose()
+    if prune_vocab:
+        proposer.pruned_vocab = torch.tensor(pruned_token_ids, device=device)
+
     # Create input tensors
     batch_spec = BatchSpec(
         seq_lens=seq_lens,
@@ -394,7 +452,6 @@ def create_deterministic_logits(token_ids):
     # Verify all tokens match our expectations
     assert torch.equal(result, expected_tokens)
-
 @pytest.mark.parametrize(
     "spec_token_tree",
     [
         [(0, ), (1, ), (2, ), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0),
          (2, 1)],  # Tree
     ])
-def test_propose_tree(spec_token_tree):
+@pytest.mark.parametrize("prune_vocab", [True, False])
+def test_propose_tree(spec_token_tree, prune_vocab):
     # Get GPU device.
     device = torch.device(current_platform.device_type)
 
@@ -420,13 +478,16 @@ def test_propose_tree(spec_token_tree):
     # Create proposer first so we can use its actual hidden_size.
     proposer = _create_proposer("eagle",
                                 num_speculative_tokens,
-                                speculative_token_tree=spec_token_tree)
+                                speculative_token_tree=spec_token_tree,
+                                prune_vocab=prune_vocab)
     # Get the hidden_size from the proposer to ensure consistency.
     hidden_size = proposer.hidden_size
 
     # Helper to create deterministic logits that will produce specific tokens
-    def create_deterministic_logits(token_ids, k: int):
+    def create_deterministic_logits(token_ids: list[int], vocab_size: int,
+                                    k: int):
         logits = torch.full((batch_size, vocab_size), -100.0, device=device)
+        if prune_vocab:
+            token_ids = [
+                pruned_token_ids.index(token_id) for token_id in token_ids
+            ]
         for i, token_id in enumerate(token_ids):
             # Assign decreasing values to the k, consecutive, tokens.
             for j in range(k):
@@ -436,6 +497,12 @@ def create_deterministic_logits(token_ids, k: int):
 
     # Mock a model that returns deterministic logits.
     base_token_ids = torch.tensor([42, 60], dtype=torch.int64, device=device)
 
+    # prune the vocab of the draft model
+    if prune_vocab:
+        # make sure our pruned vocab is large enough to cover all base tokens
+        # and the `num_speculative_tokens` we will generate
+        pruned_token_ids = [
+            i for base in base_token_ids
+            for i in range(base, base + num_speculative_tokens + 1)
+        ]
+        pruned_vocab_size = len(pruned_token_ids)
+
     # Skip loading the model and replace it with a mock that returns
     # deterministic outputs.
 model_mock = mock.MagicMock()
@@ -458,6 +525,7 @@ def test_propose_tree(spec_token_tree):
                                        dtype=torch.int32,
                                        device=device)
     logits_returns = []
+    logit_vocab_size = pruned_vocab_size if prune_vocab else vocab_size
     for level, num_children in enumerate(proposer.child_drafts_per_level):
         token_ids = base_token_ids + cu_num_drafts_tensor[level]
         level_num_drafts = cu_num_drafts_tensor[
@@ -466,6 +534,7 @@ def test_propose_tree(spec_token_tree):
         for i in range(level_num_drafts // num_children):
             level_logits.append(
                 create_deterministic_logits(token_ids + i * num_children,
+                                            logit_vocab_size,
                                             num_children))
         logits_returns.append(torch.stack(level_logits, dim=1))
     model_mock.compute_logits.side_effect = logits_returns
@@ -476,6 +545,10 @@ def test_propose_tree(spec_token_tree):
     # Assign draft attn_layer_names since load_model is not invoked
     proposer.attn_layer_names = ["layer.0"]
 
+    # Assign the pruned vocab (original token ids) to the proposer; this is
+    # the attribute EagleProposer reads in propose_tree()
+    if prune_vocab:
+        proposer.pruned_vocab = torch.tensor(pruned_token_ids, device=device)
+
     # Get the tree attention metadata builder.
     attn_metadata_builder_cls, _ = get_attention_backend(_Backend.TREE_ATTN)
     attn_metadata_builder = attn_metadata_builder_cls(
diff --git a/tests/v1/spec_decode/test_vocab_prune.py b/tests/v1/spec_decode/test_vocab_prune.py
new file mode 100644
index 000000000000..7f62bab3072b
--- /dev/null
+++ b/tests/v1/spec_decode/test_vocab_prune.py
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.v1.spec_decode.eagle import load_vocab_freq, prune_draft_vocab
+
+
+@pytest.mark.parametrize("mock_tensor", [
+    torch.tensor([10, 20, 30, 40, 50], dtype=torch.float32),
+    torch.tensor([1, 1, 1, 1, 1, 1], dtype=torch.float32),
+    torch.tensor([100, 50, 25, 12, 6, 3], dtype=torch.float32)
+])
+@patch("vllm.v1.spec_decode.eagle.hf_hub_download")
+@patch("vllm.v1.spec_decode.eagle.torch.load")
+def test_load_vocab_freq(mock_torch_load, mock_hf_hub_download, mock_tensor):
+    # Mock download & load
+    mock_hf_hub_download.return_value = "mock_draft_vocab_freq.pt"
+    mock_torch_load.return_value = mock_tensor
+
+    vocab_freq = load_vocab_freq("user/repo/mock_file.pt")
+
+    # Ensure tensor is loaded and converted correctly
+    assert isinstance(vocab_freq, torch.Tensor)
+    assert vocab_freq.dtype == torch.int64
+    assert vocab_freq.numel() == mock_tensor.numel()
+    assert torch.all(vocab_freq >= 0)
+
+
+def test_prune_draft_vocab():
+    # Frequencies designed to test cumulative mass
+    vocab_freqs = torch.ones(10, dtype=torch.float32)
+    vocab_freqs[9] = 9.0  # 50% of total mass
+
+    # 50% threshold: only the largest token
+    pruned_vocab = prune_draft_vocab(vocab_freqs, 0.5)
+    assert torch.equal(pruned_vocab, torch.tensor([9]))
+
+    # Slightly above 50%: include one more
+    pruned_vocab = prune_draft_vocab(vocab_freqs, 0.51)
+    assert torch.equal(pruned_vocab, torch.tensor([9, 0]))
+
+    # Near total mass: include all tokens
+    pruned_vocab = prune_draft_vocab(vocab_freqs, 0.99)
+    expected = torch.tensor([9, 0, 1, 2, 3, 4, 5, 6, 7, 8])
+    assert torch.equal(pruned_vocab, expected)
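
For intuition, the arithmetic that `test_prune_draft_vocab` pins down (under the cumulative-mass reading of `prune_draft_vocab` implemented later in this diff) works out as follows; the values are the test's own fixture:

    import torch

    # Fixture: nine tokens with frequency 1, one token (id 9) with frequency 9.
    freqs = torch.ones(10)
    freqs[9] = 9.0  # total mass = 18, so token 9 alone holds 9/18 = 50%

    sorted_freqs = torch.sort(freqs, descending=True, stable=True).values
    mass = torch.cumsum(sorted_freqs, dim=0) / freqs.sum()
    # mass = [0.50, 0.556, 0.611, ..., 1.0]
    # threshold 0.50 -> the first entry already reaches 50% -> keep [9]
    # threshold 0.51 -> two entries needed (55.6% >= 51%)    -> keep [9, 0]
    # threshold 0.99 -> only the full prefix reaches 99%     -> keep all ten
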
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 784536054a19..3ccb0fec3eb5 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -99,8 +99,8 @@ def __init__(
     ) -> None:
         """
         Initialize the BenchmarkDataset with an optional dataset path and random
-        seed.
-
+        seed.
+
         Args:
             dataset_path (Optional[str]): Path to the dataset. If None, it
             indicates that a default or random dataset might be used.
@@ -132,10 +132,10 @@ def apply_multimodal_chat_transformation(
             elif isinstance(mm_content, dict):
                 content.append(mm_content)
             else:
-                raise TypeError(
+                raise TypeError(
                     "Could not process multimodal content of type: " +
-                    f"{type(mm_content)}"
-                )
+                    f"{type(mm_content)}"
+                )
         return [{"role": "user", "content": content}]
 
     def load_data(self) -> None:
@@ -198,7 +198,7 @@ def get_random_lora_request(
     @abstractmethod
     def sample(self, tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
+               num_requests: int,
               request_id_prefix: str = "") -> list[SampleRequest]:
         """
         Abstract method to generate sample requests from the dataset.
@@ -211,7 +211,7 @@
             for processing the dataset's text.
             num_requests (int): The number of sample requests to generate.
             request_id_prefix (str) The prefix of request_id.
-
+
         Returns:
             list[SampleRequest]: A list of sample requests generated from the
@@ -518,7 +518,7 @@ def get_sampling_params(
                                           size=num_requests)
         output_lens = self._rng.integers(output_low, output_high + 1,
                                          size=num_requests)
-        offsets = self._rng.integers(0, tokenizer.vocab_size,
+        offsets = self._rng.integers(0, tokenizer.vocab_size,
                                      size=num_requests)
         return input_lens, output_lens, offsets
@@ -546,7 +546,7 @@ def generate_token_sequence(
         the encoded sequence is truncated before being decode again.
         """
         # Build the inner sequence by sampling sequentially from the vocab
-        inner_seq = ((offset + index + np.arange(input_len))
+        inner_seq = ((offset + index + np.arange(input_len))
                      % vocab_size).tolist()
         token_sequence = prefix_token_ids + inner_seq
@@ -581,9 +581,9 @@ class RandomMultiModalDataset(RandomDataset):
       `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0.
       The maximum is further clamped to the sum of per-modality limits.
    2) Each item’s modality and shape is sampled from `bucket_config`, a dict
-      mapping (height, width, num_frames) → probability. We treat
-      `num_frames`=1 as image and and `num_frames` > 1 as video.
-      Entries with zero probability are removed and the rest are renormalized
+      mapping (height, width, num_frames) → probability. We treat
+      `num_frames`=1 as image and `num_frames` > 1 as video.
+      Entries with zero probability are removed and the rest are renormalized
       to sum to 1.
    3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
       When a modality reaches its cap, all of its buckets are excluded and the
@@ -591,8 +591,8 @@
    Example bucket configuration:
        {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
-      - Two image buckets (`num_frames`=1) and one video bucket
-        (`num_frames`=16).
+      - Two image buckets (`num_frames`=1) and one video bucket
+        (`num_frames`=16).
    OBS.: Only image sampling is supported for now.
    """
@@ -615,9 +615,9 @@ def __init__(self, **kwargs) -> None:
 
     def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
         """Generate synthetic PIL image with random RGB values.
-
-        NOTE: iid pixel sampling results in worst-case compression
-        (good for stressing I/O), but very unlike real photos.
+
+        NOTE: iid pixel sampling results in worst-case compression
+        (good for stressing I/O), but very unlike real photos.
         We could consider a “low-freq” mode (e.g., noise blur)
         to emulate network realism instead of max stress.
""" @@ -629,11 +629,11 @@ def generate_synthetic_image(self, width: int, height: int) -> Image.Image: ) return Image.fromarray(random_pixels) - def generate_synthetic_video(self, width: int, - height: int, + def generate_synthetic_video(self, width: int, + height: int, num_frames: int) -> Any: """Generate synthetic video with random values. - + TODO: Finish this method. """ raise NotImplementedError("Video sampling is WIP.") @@ -647,7 +647,7 @@ def map_config_to_modality(self, config: tuple[int, int, int]) -> str: else: raise ValueError(f"Invalid multimodal item configuration: {config}") - def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int], + def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int], float]) -> dict[tuple[int, int, int], float]: """ Remove zero probability entries @@ -667,24 +667,24 @@ def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int], return {k: v / total for k, v in bucket_config.items()} - def generate_mm_item(self, + def generate_mm_item(self, mm_item_config: tuple[int, int, int], ) -> Mapping[str, Any]: """ - Create synthetic images and videos and + Create synthetic images and videos and apply process_image/process_video respectively. This follows the OpenAI API chat completions https://github.com/openai/openai-python """ - + if self.map_config_to_modality(mm_item_config) == "image": return process_image(self.generate_synthetic_image( mm_item_config[1], mm_item_config[0])) elif self.map_config_to_modality(mm_item_config) == "video": return process_video(self.generate_synthetic_video( - mm_item_config[1], - mm_item_config[0], + mm_item_config[1], + mm_item_config[0], mm_item_config[2])) else: raise ValueError(f"Invalid multimodal item configuration: " @@ -714,17 +714,17 @@ def get_mm_item_sampling_params( f"limit_mm_per_prompt: " f"{limit_mm_per_prompt.keys()}") - # Remove zero probability entries + # Remove zero probability entries # and normalize bucket config to sum to 1 bucket_config = self.normalize_bucket_config(bucket_config) logger.info( "Normalized bucket config: %s", bucket_config, ) # Only consider limit per prompt for modalities in bucket config - allowed_modalities = {self.map_config_to_modality(cfg) + allowed_modalities = {self.map_config_to_modality(cfg) for cfg in bucket_config} limit_mm_per_prompt = { - k: v for k, v in limit_mm_per_prompt.items() + k: v for k, v in limit_mm_per_prompt.items() if k in allowed_modalities} if not limit_mm_per_prompt: raise ValueError("No valid limits for modalities present in " @@ -737,19 +737,19 @@ def get_mm_item_sampling_params( # Get max and min num mm items and ensure # it is at most the sum of limit_mm_per_prompt for all modalities max_num_mm_items = min( - sum(limit_mm_per_prompt.values()), + sum(limit_mm_per_prompt.values()), math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio)) ) # Ensure min num mm items is at least 0 min_num_mm_items = max( - 0, + 0, math.floor(base_items_per_request * (1 - num_mm_items_range_ratio)) ) # Raise error if min num mm items is greater than max num mm items if min_num_mm_items > max_num_mm_items: raise ValueError(f"Min num mm items is greater than max mm items: " f"{min_num_mm_items} > {max_num_mm_items}") - + logger.info( "Sampling number of multimodal items from [%s, %s]", min_num_mm_items, max_num_mm_items, @@ -774,8 +774,8 @@ def get_mm_item_iterator( whose size is between min_num_mm_items and max_num_mm_items. Loop over the bucket config and sample a multimodal item. 
- Loop until the number of multimodal items sampled is equal to - request_num_mm_items or limit of multimodal items per prompt + Loop until the number of multimodal items sampled is equal to + request_num_mm_items or limit of multimodal items per prompt for all modalities is reached. Note: @@ -787,19 +787,19 @@ def get_mm_item_iterator( # Get the number of multimodal items to sample request_num_mm_items = int( self._rng.integers(min_num_mm_items, max_num_mm_items + 1) - ) + ) # If request_num_mm_items is 0, yield an empty iterator if request_num_mm_items == 0: return # Initialize modality counters - modality_counter = {self.map_config_to_modality(k): 0 + modality_counter = {self.map_config_to_modality(k): 0 for k in bucket_config} # Copy the bucket config to avoid modifying the original bucket_config_copy = bucket_config.copy() # Loop over the number of multimodal items to sample while sum(modality_counter.values()) < request_num_mm_items: # Sample a multimodal item config - mm_item_config = self._rng.choice(list(bucket_config_copy.keys()), + mm_item_config = self._rng.choice(list(bucket_config_copy.keys()), p=list(bucket_config_copy.values())) modality = self.map_config_to_modality(mm_item_config) # Check that modality count is less than limit per prompt @@ -839,7 +839,7 @@ def sample( limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT, base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST, num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, - bucket_config: dict[tuple[int, int, int], float] = + bucket_config: dict[tuple[int, int, int], float] = DEFAULT_MM_ITEM_BUCKET_CONFIG, enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT, **kwargs, @@ -847,7 +847,7 @@ def sample( # NOTE: Video sampling is WIP. Raise error if video is in bucket config # and probability is non-zero. - if any(self.map_config_to_modality(cfg) == "video" and p > 0 + if any(self.map_config_to_modality(cfg) == "video" and p > 0 for cfg, p in bucket_config.items()): raise NotImplementedError("Video sampling not implemented; " "set its probability to 0.") @@ -898,7 +898,7 @@ def sample( ]) if enable_multimodal_chat: - # NOTE: For now this option is only provided for completeness + # NOTE: For now this option is only provided for completeness # given that the serve.py benchmark currently does not use it. 
mm_chat_prompt: Any = prompt mm_chat_prompt = self.apply_multimodal_chat_transformation( @@ -983,11 +983,11 @@ def sample( skip_min_output_len_check=output_len is not None): continue - if image_path := entry.get("image"): - mm_content = process_image(image_path) - elif video_path := entry.get("video"): + if image_path := entry.get("image"): + mm_content = process_image(image_path) + elif video_path := entry.get("video"): mm_content = process_video(video_path) - else: + else: mm_content = None if enable_multimodal_chat: prompt = self.apply_multimodal_chat_transformation( @@ -1008,6 +1008,7 @@ def sample( def add_dataset_parser(parser: FlexibleArgumentParser): parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--request-id-prefix", type=str, default="", help="The prefix for the request id.") parser.add_argument( "--num-prompts", type=int, @@ -1627,7 +1628,7 @@ def sample( expected_output_len=output_len, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix) return sampled_requests @@ -1933,7 +1934,7 @@ def sample(self, request_id=request_id_prefix + str(ind), )) ind += 1 - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix) return sampled_requests @@ -1992,7 +1993,7 @@ def sample( multi_modal_data=mm_content, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix) return sampled_requests @@ -2053,7 +2054,7 @@ def sample(self, expected_output_len=output_len, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix) return sampled_requests @@ -2114,7 +2115,7 @@ def sample( expected_output_len=output_len, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix) return sampled_requests @@ -2254,7 +2255,7 @@ def sample(self, expected_output_len=output_len, multi_modal_data=None, request_id=request_id_prefix + str(ind), - + )) ind += 1 self.maybe_oversample_requests(sampled_requests, num_requests, @@ -2435,7 +2436,7 @@ def sample( " what Whisper supports.", skipped, ) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix) return sampled_requests @@ -2519,7 +2520,7 @@ def sample( ) ind += 1 - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix) return sampled_requests @@ -2530,7 +2531,7 @@ def sample( class PrefixRepetitionRandomDataset(BenchmarkDataset): - # Default values copied from benchmark_serving.py for the repeated prefix + # Default values copied from benchmark_serving.py for the repeated prefix # dataset. 
 DEFAULT_PREFIX_LEN = 256
 DEFAULT_SUFFIX_LEN = 256
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 41d9fcb824b0..cbd15abe643b 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -32,11 +32,11 @@ def ignore_torch_compile(cls: _T) -> _T:
     a support_torch_compile decorator, but we don't want to
     compile the class `cls` that inherits the parent class.
     This only ignores compiling the forward of the class the
-    decorator is applied to.
+    decorator is applied to.
 
     If the parent has ignore_torch_compile but the child has
     support_torch_compile, the child will still be compiled.
-
+
     If the class has one or more submodules
     that have support_torch_compile decorator applied, compile
     will not be ignored for those submodules.
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 4f4673ac6e67..5b84621c9611 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1996,6 +1996,14 @@ class SpeculativeConfig:
     Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
     will use the default version."""
 
+    # Draft model vocabulary configuration
+    draft_vocab_frequency_path: Optional[str] = None
+    """Hugging Face path ('username/repo/file.pt') to the per-token scores
+    (typically corpus token frequencies) used to prune the draft model
+    vocabulary."""
+    draft_vocab_frequency_keep_threshold: Optional[float] = None
+    """If `None`, do not prune the vocabulary. Otherwise must be in (0, 1]:
+    keep the most frequent tokens whose cumulative share of the total
+    frequency mass reaches this threshold."""
+
     # Advanced control
     disable_by_batch_size: Optional[int] = None
     """Disable speculative decoding for new incoming requests when the number
@@ -2464,7 +2472,7 @@ class LoRAConfig:
     lora_dtype: Union[torch.dtype, LoRADType] = "auto"
     """Data type for LoRA. If auto, will default to base model dtype."""
     lora_extra_vocab_size: int = 256
-    """(Deprecated) Maximum size of extra vocabulary that can be present in a
+    """(Deprecated) Maximum size of extra vocabulary that can be present in a
     LoRA adapter. Will be removed in v0.12.0."""
     lora_vocab_padding_size: ClassVar[int] = current_platform\
         .get_lora_vocab_padding_size()
@@ -2698,8 +2706,8 @@ class PoolerConfig:
     """
     Maximum input length allowed for embedding generation. When set, allows
     inputs longer than max_embed_len to be accepted for embedding models.
-    When an input exceeds max_embed_len, it will be handled according to
-    the original max_model_len validation logic.
+    When an input exceeds max_embed_len, it will be handled according to
+    the original max_model_len validation logic.
     Defaults to None (i.e. set to max_model_len).
     """
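
For reference, a minimal offline sketch of how the two new config fields are meant to be used (model and path values are illustrative, and this assumes they are forwarded through the `speculative_config` dict exactly like the CLI flags in `scripts/spec.sh`):

    from vllm import LLM

    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",
        speculative_config={
            "method": "eagle",
            "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
            "num_speculative_tokens": 2,
            # HF path 'username/repo/file.pt' holding per-token frequencies
            "draft_vocab_frequency_path":
                "eturok/llama-3.1-8b-instruct-vocab-freq/vocab_freq.pt",
            # keep the most frequent tokens covering 25% of total mass
            "draft_vocab_frequency_keep_threshold": 0.25,
        },
    )
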
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e6fd61ae1aad..cb001dec417a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -63,6 +63,12 @@
 _R = TypeVar("_R", default=Any)
 
+def save_stats(forward_time, input_ids_shape, model_type):
+    import pathlib
+    outputs_dir = pathlib.Path("outputs/")
+    if not outputs_dir.is_dir():
+        return
+    latest_dir = max([d for d in outputs_dir.iterdir() if d.is_dir()],
+                     key=lambda x: x.name, default=None)
+    if latest_dir is None:
+        return
+    with open(latest_dir / f"{model_type}.csv", 'a') as f:
+        print(f"{forward_time},{input_ids_shape}", file=f)
 
 class LLM:
     """An LLM for generating texts from given prompts and sampling parameters.
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 3d5e59addfcf..16b369f0e639 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -17,7 +17,9 @@
 GREEDY_TEMPERATURE: tl.constexpr = -1
 # Maximum number of speculative draft tokens allowed per request in a single
 # step. This value is chosen to be large enough to handle typical use cases.
-MAX_SPEC_LEN = 32
+# For example, FR-Spec (https://github.com/thunlp/FR-Spec/tree/main)
+# reports large speedups with as many as 60 speculative tokens.
+MAX_SPEC_LEN = 64
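
The per-forward timings written by the `save_stats` helpers land in `outputs/<timestamp>/{drafter,target}.csv` as `time,shape` rows. A minimal sketch of aggregating them after a run (assuming that file layout; standard library only):

    import pathlib
    import statistics

    # latest timestamped run directory created by the benchmark script
    run_dir = max(pathlib.Path("outputs").iterdir(), key=lambda d: d.name)
    for name in ("drafter", "target"):
        times = []
        with open(run_dir / f"{name}.csv") as f:
            for line in f:
                # each row is "<forward_time>,<input_ids_shape>"
                times.append(float(line.split(",", 1)[0]))
        if times:
            print(f"{name}: n={len(times)}, "
                  f"mean={statistics.mean(times) * 1e3:.2f} ms, "
                  f"p50={statistics.median(times) * 1e3:.2f} ms")
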
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index bf25c91d8390..411183bfb3bd 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import ast
+import copy
+import time
 from dataclasses import replace
 from importlib.util import find_spec
 from typing import Optional, Protocol
@@ -8,6 +10,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
+from huggingface_hub import hf_hub_download
 
 from vllm.attention.layer import Attention
 from vllm.config import (CompilationLevel, VllmConfig,
@@ -32,6 +35,12 @@
 PADDING_SLOT_ID = -1
 
+def save_stats(forward_time, input_ids_shape):
+    import pathlib
+    outputs_dir = pathlib.Path("outputs/")
+    if not outputs_dir.is_dir():
+        return
+    latest_dir = max([d for d in outputs_dir.iterdir() if d.is_dir()],
+                     key=lambda x: x.name, default=None)
+    if latest_dir is None:
+        return
+    with open(latest_dir / "drafter.csv", 'a') as f:
+        print(f"{forward_time},{input_ids_shape}", file=f)
 
 class EagleAttentionMetadata(Protocol):
     # Required attributes
@@ -71,6 +80,9 @@ def __init__(
         # hidden size (e.g., Llama 3.3 70B).
         self.hidden_size = self.draft_model_config.get_hidden_size()
 
+        # original token ids kept after pruning the draft model vocabulary
+        # (None means no pruning)
+        self.pruned_vocab = None
+
         self.is_multimodal_model = vllm_config.model_config \
             .is_multimodal_model
 
     def propose(
         self,
         # [num_tokens]
@@ -212,12 +225,16 @@ def propose(
         with set_forward_context(per_layer_attn_metadata,
                                  self.vllm_config,
                                  num_tokens=num_input_tokens):
+            st = time.perf_counter()
             ret_hidden_states = self.model(
                 input_ids=input_ids,
                 positions=self.positions[:num_input_tokens],
                 hidden_states=self.hidden_states[:num_input_tokens],
                 inputs_embeds=inputs_embeds,
             )
+            et = time.perf_counter()
+            save_stats(et - st, input_ids.shape)
+
         if self.method in ("deepseek_mtp", "ernie_mtp"):
             last_hidden_states = ret_hidden_states
             hidden_states = last_hidden_states
@@ -237,11 +254,15 @@ def propose(
                 hidden_states=hidden_states,
                 common_attn_metadata=common_attn_metadata,
             )
+            # [batch_size, num_tree_tokens]
             return torch.cat(draft_token_ids_list, dim=1)
 
         draft_token_ids = logits.argmax(dim=-1)
 
+        if self.vllm_config.speculative_config.draft_vocab_frequency_path is not None:
+            draft_token_ids = self.pruned_vocab[draft_token_ids]
+
         # Early exit if there is only one draft token to be generated.
         if self.num_speculative_tokens == 1:
             # [batch_size, 1]
@@ -322,16 +343,22 @@ def propose(
             with set_forward_context(per_layer_attn_metadata,
                                      self.vllm_config,
                                      num_tokens=input_batch_size):
+                st = time.perf_counter()
                 last_hidden_states, hidden_states = self.model(
                     input_ids=input_ids,
                     positions=self.positions[:input_batch_size],
                     hidden_states=self.hidden_states[:input_batch_size],
                     inputs_embeds=inputs_embeds,
                 )
+                et = time.perf_counter()
+                save_stats(et - st, input_ids.shape)
+
             hidden_states = hidden_states[:batch_size]
             logits = self.model.compute_logits(last_hidden_states[:batch_size],
                                                None)
             draft_token_ids = logits.argmax(dim=-1)
+            if self.vllm_config.speculative_config.draft_vocab_frequency_path is not None:
+                draft_token_ids = self.pruned_vocab[draft_token_ids]
             draft_token_ids_list.append(draft_token_ids)
 
     # [batch_size, num_speculative_tokens]
@@ -363,6 +390,10 @@ def propose_tree(
     else:
         draft_token_ids = torch.topk(logits, num_children,
                                      dim=-1).indices.view(batch_size, -1)
+
+    if self.vllm_config.speculative_config.draft_vocab_frequency_path is not None:
+        draft_token_ids = self.pruned_vocab[draft_token_ids]
+
     draft_token_ids_list = [draft_token_ids]
     draft_hidden_states = hidden_states.view(batch_size, 1, -1)
@@ -471,12 +502,15 @@ def propose_tree(
         with set_forward_context(per_layer_attn_metadata,
                                  self.vllm_config,
                                  num_tokens=num_input_tokens):
+            st = time.perf_counter()
             last_hidden_states, hidden_states = self.model(
                 input_ids=self.input_ids[:num_input_tokens],
                 positions=self.positions[:num_input_tokens],
                 hidden_states=self.hidden_states[:num_input_tokens],
                 inputs_embeds=None,
             )
+            et = time.perf_counter()
+            save_stats(et - st, self.input_ids[:num_input_tokens].shape)
 
         # Get the output hidden states for the draft tokens.
         draft_hidden_states = hidden_states[:num_tokens].view(
@@ -499,6 +533,9 @@ def propose_tree(
             draft_token_ids = torch.topk(logits, num_children,
                                          dim=-1).indices.view(
                                              batch_size, -1)
+            if self.vllm_config.speculative_config.draft_vocab_frequency_path is not None:
+                draft_token_ids = self.pruned_vocab[draft_token_ids]
+
             draft_token_ids_list.append(draft_token_ids)
 
             # Update the # drafts counters for the next tree level.
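
Since the pruned `lm_head` produces logits over the reduced vocabulary only, each `argmax`/`topk` above returns positions within the pruned vocabulary, and `self.pruned_vocab[draft_token_ids]` maps them back to original token ids. A standalone illustration with toy values:

    import torch

    pruned_vocab = torch.tensor([9, 0, 4])    # original ids kept, most frequent first
    logits = torch.tensor([[0.1, 2.0, 0.3]])  # draft logits over the 3-token pruned vocab
    pruned_ids = logits.argmax(dim=-1)        # tensor([1]): index into the pruned vocab
    token_ids = pruned_vocab[pruned_ids]      # tensor([0]): original vocabulary id
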
@@ -648,8 +685,60 @@ def load_model(self, target_model: nn.Module) -> None:
         if self.vllm_config.speculative_config.method != "eagle3" and \
                 hasattr(target_language_model, "lm_head"):
             logger.info("Loading EAGLE LM head weights from the target model.")
             self.model.lm_head = target_language_model.lm_head
 
+            # Prune the draft model vocabulary
+            if self.vllm_config.speculative_config.draft_vocab_frequency_path is not None:
+                vocab_freq_path = self.vllm_config.speculative_config.draft_vocab_frequency_path
+                keep_threshold = self.vllm_config.speculative_config.draft_vocab_frequency_keep_threshold
+
+                if keep_threshold is None:
+                    raise ValueError(
+                        "When `draft_vocab_frequency_path` is set, "
+                        "`draft_vocab_frequency_keep_threshold` cannot be None."
+                    )
+
+                logger.info("Loading draft model vocabulary scores from %s",
+                            vocab_freq_path)
+                vocab_freq = load_vocab_freq(vocab_freq_path)
+
+                logger.info(
+                    "Keeping the most frequent draft tokens covering %.0f%% "
+                    "of the vocabulary frequency mass.", keep_threshold * 100)
+                self.pruned_vocab = prune_draft_vocab(vocab_freq,
+                                                      keep_threshold)
+                self.pruned_vocab = self.pruned_vocab.to(
+                    self.model.lm_head.weight.device)
+
+                # Update lm_head weights with pruned vocabulary
+                if hasattr(self.model, "lm_head"):
+                    # to prune the vocab, the draft lm_head cannot be shared
+                    # with the target model lm_head
+                    if self.model.lm_head is target_language_model.lm_head:
+                        self.model.lm_head = copy.deepcopy(self.model.lm_head)
+
+                    # Keep old weight reference so its memory can be released
+                    old_weight = self.model.lm_head.weight
+
+                    # In-place pruning of the weight
+                    self.model.lm_head.weight.data = \
+                        old_weight.data[self.pruned_vocab].clone().detach()
+
+                    # Free old memory
+                    del old_weight
+                    torch.cuda.empty_cache()
+                    torch.cuda.synchronize()
+                elif hasattr(self.model.model, "embed_tokens"):
+                    logger.info("Assuming lm_head is tied to embed_tokens; "
+                                "skipping direct weight update.")
+                else:
+                    logger.warning("No lm_head or embed_tokens found; "
+                                   "pruned vocabulary not applied.")
+
     @torch.inference_mode()
     def dummy_run(
         self,
@@ -691,6 +780,61 @@ def validate_same_kv_cache_group(
         ) == 1, "All eagle layers should belong to the same kv cache group"
 
+
+def load_vocab_freq(vocab_frequency_path: str) -> torch.Tensor:
+    """
+    Load vocabulary frequencies from a Hugging Face dataset file.
+
+    Args:
+        vocab_frequency_path: HF path in the form 'username/repo/file.pt'
+
+    Returns:
+        Tensor of integer vocabulary frequencies.
+ """ + if not vocab_frequency_path: + raise ValueError("`vocab_frequency_path` must be provided.") + + # Parse HF path + parts = vocab_frequency_path.split("/") + if len(parts) < 3: + raise ValueError("HF path must be at least 'username/repo/file.pt'") + repo_id = "/".join(parts[:2]) + file_path_in_repo = "/".join(parts[2:]) + + # Download the file + try: + local_path = hf_hub_download(repo_id=repo_id, filename=file_path_in_repo, repo_type="dataset") + except Exception as e: + local_path = hf_hub_download(repo_id=repo_id, filename=file_path_in_repo) + + # Load as a tensor of integers + vocab_freq = torch.load(local_path, weights_only=True) + vocab_freq = torch.tensor(vocab_freq).to(torch.int64) + return vocab_freq + + +def prune_draft_vocab(vocab_freq: torch.Tensor, keep_threshold: float) -> torch.Tensor: + """ + Prune a draft vocabulary based on the keep threshold. + + Args: + vocab_freq: Tensor of vocabulary frequencies. + keep_threshold: Fraction of cumulative mass to retain (0 < keep_threshold < 1). + + Returns: + Tensor of indices representing the pruned vocabulary. + """ + if not isinstance(vocab_freq, torch.Tensor): + raise TypeError("`vocab_freq` must be a torch.Tensor.") + if not (0 <= keep_threshold <= 1): + raise ValueError(f"`keep_threshold` must be in [0, 1], got {keep_threshold}") + + # Sort frequencies descending + _, sorted_indices = torch.sort(vocab_freq, descending=True) + cutoff_idx = int(keep_threshold * len(sorted_indices)) + pruned_vocab = sorted_indices[:cutoff_idx] + return pruned_vocab + + # NOTE(woosuk): Currently, the below code is not used and we always use argmax # to sample the draft tokens. We will use this after we find a way to manage # the draft prob tensor. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 897c3a621320..5d4b3b169401 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4,6 +4,7 @@ import gc import itertools import time +import os from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager @@ -100,6 +101,12 @@ logger = init_logger(__name__) +def save_stats(forward_time, input_ids_shape, model_type): + import pathlib + outputs_dir = pathlib.Path("outputs/") + latest_dir = max([d for d in outputs_dir.iterdir() if d.is_dir()], key=lambda x: x.name, default=None) + with open(latest_dir / f"{model_type}.csv", 'a') as f: + print(f"{forward_time},{input_ids_shape}", file=f) # Wrapper for ModelRunnerOutput to support overlapped execution. class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): @@ -131,7 +138,7 @@ def __init__( def get_output(self) -> ModelRunnerOutput: """Copy the device tensors to the host and return a ModelRunnerOutput. - + This function blocks until the copy is finished. """ self._async_copy_ready_event.synchronize() @@ -377,6 +384,10 @@ def __init__( device="cpu", pin_memory=self.pin_memory) + # todo: delete + self.forward_times = [] + self.input_ids_shapes = [] + def _make_buffer(self, *size: Union[int, torch.SymInt], dtype: torch.dtype, @@ -723,7 +734,7 @@ def _get_cumsum_and_arange( def _prepare_input_ids(self, total_num_scheduled_tokens: int, cu_num_tokens: np.ndarray) -> None: """Prepare the input IDs for the current batch. 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 897c3a621320..5d4b3b169401 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -100,6 +101,12 @@
 logger = init_logger(__name__)
 
+def save_stats(forward_time, input_ids_shape, model_type):
+    import pathlib
+    outputs_dir = pathlib.Path("outputs/")
+    if not outputs_dir.is_dir():
+        return
+    latest_dir = max([d for d in outputs_dir.iterdir() if d.is_dir()],
+                     key=lambda x: x.name, default=None)
+    if latest_dir is None:
+        return
+    with open(latest_dir / f"{model_type}.csv", 'a') as f:
+        print(f"{forward_time},{input_ids_shape}", file=f)
 
 # Wrapper for ModelRunnerOutput to support overlapped execution.
 class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
@@ -131,7 +138,7 @@ def __init__(
     def get_output(self) -> ModelRunnerOutput:
         """Copy the device tensors to the host and return a ModelRunnerOutput.
-
+
         This function blocks until the copy is finished.
         """
         self._async_copy_ready_event.synchronize()
@@ -377,6 +384,10 @@ def __init__(
             device="cpu",
             pin_memory=self.pin_memory)
 
+        # TODO: delete these profiling buffers
+        self.forward_times = []
+        self.input_ids_shapes = []
+
     def _make_buffer(self,
                      *size: Union[int, torch.SymInt],
                      dtype: torch.dtype,
@@ -723,7 +734,7 @@ def _get_cumsum_and_arange(
     def _prepare_input_ids(self, total_num_scheduled_tokens: int,
                            cu_num_tokens: np.ndarray) -> None:
         """Prepare the input IDs for the current batch.
-
+
         Carefully handles the `prev_sampled_token_ids` which can be cached
         from the previous engine iteration, in which case those tokens on the
         GPU need to be copied into the corresponding slots into input_ids."""
@@ -1900,6 +1912,7 @@ def execute_model(
             ), record_function_or_nullcontext("Forward"),
                 self.maybe_get_kv_connector_output(scheduler_output) as
                 kv_connector_output):
+            st = time.perf_counter()
             model_output = self.model(
                 input_ids=input_ids,
                 positions=positions,
@@ -1907,6 +1920,8 @@ def execute_model(
                 inputs_embeds=inputs_embeds,
                 **model_kwargs,
             )
+            et = time.perf_counter()
+            save_stats(et - st, input_ids.shape, "target")
 
         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
@@ -2218,7 +2234,10 @@ def load_model(self, eep_scale_up: bool = False) -> None:
                 self.device)
         if hasattr(self, "drafter"):
             logger.info("Loading drafter model...")
             self.drafter.load_model(self.model)
         if self.use_aux_hidden_state_outputs:
             if supports_eagle3(self.model):
                 self.model.set_aux_hidden_state_layers(