diff --git a/docs/backend/speculative_decoding.ipynb b/docs/backend/speculative_decoding.ipynb index afec9c4d51b7..011d8030b6a0 100644 --- a/docs/backend/speculative_decoding.ipynb +++ b/docs/backend/speculative_decoding.ipynb @@ -150,18 +150,11 @@ "source": [ "### EAGLE Decoding via Frequency-Ranked Speculative Sampling\n", - "By employing a truncated high-frequency token vocabulary in the draft model, Eagle speculative decoding reduces lm_head computational overhead while accelerating the pipeline without quality degradation.For more details checkout [this paper](https://arxiv.org/pdf/arXiv:2502.14856)\n", + "By employing a truncated high-frequency token vocabulary in the draft model, Eagle speculative decoding reduces lm_head computational overhead while accelerating the pipeline without quality degradation. For more details, check out [this paper](https://arxiv.org/pdf/2502.14856).\n", "\n", - "Set `--speculative-token-map` to use this optimization. You can get the high-frequency token in FR-Spec from https://huggingface.co/thunlp/LLaMA3-Instruct-8B-FR-Spec. Or you can obtain high-frequency token by yourself.\n", - "+ Execute inference on your dataset using sglang's standard inference mode and persist the outputs.\n", - "+ Extract the top-k high-frequency tokens from the saved file. There is a reference implementation (https://gist.github.com/Zhou-sx/71a9196d2f324c93f79016579fdf57da). \n" + "Set `--speculative-token-map` to use this optimization. You can get the high-frequency tokens used in FR-Spec from https://huggingface.co/thunlp/LLaMA3-Instruct-8B-FR-Spec. Or you can obtain the high-frequency tokens yourself (https://github.com/thunlp/FR-Spec/tree/main?tab=readme-ov-file#prepare-fr-spec-vocabulary-subset).\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -179,9 +172,9 @@ "\n", "server_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --speculative-algo EAGLE \\\n", - " --speculative-draft lmzheng/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 3 \\\n", - " --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --speculative-token-map {hot_token_ids.pt} \n", + "python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algo EAGLE \\\n", + " --speculative-draft lmzheng/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n", + " --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map {hot_token_ids.pt} \n", "\"\"\"\n", ")\n", "\n", @@ -199,7 +192,7 @@ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", "\n", "response = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n", " messages=[\n", " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", " ],\n",