6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -33,6 +33,12 @@ repos:
rev: 24.10.0
hooks:
- id: black-jupyter
+- repo: https://github.com/codespell-project/codespell
+rev: v2.4.1
+hooks:
+- id: codespell
+additional_dependencies: ['tomli']
+args: ['--toml', 'python/pyproject.toml']
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8
hooks:
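With this hook in place, codespell runs automatically on every commit. A minimal local sketch, assuming `pre-commit` is installed and using the hook id and config path from the hunk above:

```bash
# Install the git hook once per clone (sketch; assumes pre-commit is available)
pip install pre-commit
pre-commit install

# Run only the new codespell hook against the whole repository
pre-commit run codespell --all-files
```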
2 changes: 1 addition & 1 deletion 3rdparty/amd/tuning/TUNING.md
@@ -104,7 +104,7 @@ To maximize moe kernel efficiency, need to use below scripts to find out the bes

```bash
#Tuning
-#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
+#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
#so we can tune decode moe use below command
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
# and use this command to tune prefill moe
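The decode command above passes `--batch "32"`; a prefill run would presumably use the prefill batch computed in the comment (32*1024 = 32768). A sketch reusing the same flags, with only the `--batch` value assumed:

```bash
# Hypothetical prefill MoE tuning run; --batch derived from batch-size * input length (32 * 1024)
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32768"
```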
4 changes: 2 additions & 2 deletions benchmark/hicache/bench_serving.py
@@ -267,7 +267,7 @@ async def get_requests(
try:
request = await asyncio.wait_for(
input_requests_queue.get(), timeout=300
-) # Wait for 5 minites then abort
+) # Wait for 5 minutes then abort
except Exception as e:
print(f"exception: {e}")
break
@@ -514,7 +514,7 @@ async def limited_request_func(request_func_input, queue, tokenizer, pbar):
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
print(
"{:<40} {:<10}".format(
"Max reqeuest concurrency:",
"Max request concurrency:",
max_concurrency if max_concurrency else "not set",
)
)
4 changes: 2 additions & 2 deletions benchmark/json_schema/bench_sglang.py
@@ -95,15 +95,15 @@ def bench_schema(args):
latency = time.time() - tic

# Check if the outputs are valid
-indexs = []
+indexes = []
for i, state in enumerate(states):
try:
schema = json.loads(arguments[i]["json_schema"])
obj = json.loads(state["json_output"])
assert jsonschema.validate(obj, schema) is None
except Exception as e:
print(e)
-indexs.append(i)
+indexes.append(i)

return states, latency

2 changes: 1 addition & 1 deletion benchmark/line_retrieval/gen_data.py
@@ -15,7 +15,7 @@

def generate_lines(random_words, num_lines, redirect_ratio):
prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask."
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resovling the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"

# Raw lines
visited_indices = set([None])
2 changes: 1 addition & 1 deletion benchmark/multi_document_qa/bench_other.py
@@ -17,7 +17,7 @@

def multi_document_qa(docs, question, generate):
s = USER_PREFIX
s += "Pleaes answer a question according to given documents.\n"
s += "Please answer a question according to given documents.\n"
s += "Question:" + question + "Documents begin.\n"

s += "".join(docs)
2 changes: 1 addition & 1 deletion benchmark/multi_document_qa/bench_sglang.py
@@ -13,7 +13,7 @@
@sgl.function
def multi_document_qa(s, docs, question):
s += sgl.user_begin()
s += "Pleaes answer a question according to given documents.\n"
s += "Please answer a question according to given documents.\n"
s += "Question:" + question + "Documents begin.\n"

forks = s.fork(len(docs))
4 changes: 2 additions & 2 deletions docs/backend/function_calling.ipynb
@@ -6,7 +6,7 @@
"source": [
"# Tool and Function Calling\n",
"\n",
"This guide demonstrates how to use SGLang’s [Funcion calling](https://platform.openai.com/docs/guides/function-calling) functionality."
"This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
]
},
{
@@ -399,7 +399,7 @@
" },\n",
"}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print_highlight(\"==== Reponse ====\")\n",
"print_highlight(\"==== Response ====\")\n",
"print(gen_response)\n",
"\n",
"# parse the response\n",
2 changes: 1 addition & 1 deletion docs/backend/openai_api_completions.ipynb
@@ -275,7 +275,7 @@
"source": [
"## Structured Outputs (JSON, Regex, EBNF)\n",
"\n",
"For OpenAI compatible structed outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
"For OpenAI compatible structured outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
]
},
{
2 changes: 1 addition & 1 deletion docs/backend/sampling_params.md
@@ -40,7 +40,7 @@ The `/generate` endpoint accepts the following parameters in JSON format. For de
| Argument | Type/Default | Description |
|--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
| frequency_penalty | `float = 0.0` | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
-| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occured. |
+| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
| min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |

### Constrained decoding
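As an illustration of the penalty parameters in the table above, a minimal native `/generate` request might look like the following sketch (default local port and payload shape assumed; values are arbitrary):

```bash
# Sketch: set both penalties on a /generate call against a locally running server
curl -s http://localhost:30000/generate \
  -H "Content-Type: application/json" \
  -d '{
        "text": "List three fruits:",
        "sampling_params": {
          "max_new_tokens": 64,
          "frequency_penalty": 0.5,
          "presence_penalty": 0.3,
          "min_new_tokens": 1
        }
      }'
```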
2 changes: 1 addition & 1 deletion docs/backend/send_request.ipynb
@@ -166,7 +166,7 @@
"source": [
"## Using Native Generation APIs\n",
"\n",
"You can also use the native `/generate` endpoint with requests, which provides more flexiblity. An API reference is available at [Sampling Parameters](sampling_params.md)."
"You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
]
},
{
2 changes: 1 addition & 1 deletion docs/backend/separate_reasoning.ipynb
@@ -378,7 +378,7 @@
"\n",
" Args:\n",
" model_type (str): Type of model to parse reasoning from\n",
" stream_reasoning (bool): If Flase, accumulates reasoning content until complete.\n",
" stream_reasoning (bool): If False, accumulates reasoning content until complete.\n",
" If True, streams reasoning content as it arrives.\n",
" \"\"\"\n",
"\n",
4 changes: 2 additions & 2 deletions docs/backend/speculative_decoding.ipynb
@@ -11,7 +11,7 @@
"\n",
"### Performance Highlights\n",
"\n",
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be archieved via EAGLE3 decoding.\n",
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n",
"For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n",
"\n",
"| Method | Throughput (tokens/s) |\n",
@@ -296,7 +296,7 @@
"- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n",
"- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n",
"\n",
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionaly to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
"\n",
"\n",
"For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)."
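For reference, an EAGLE3 launch along the lines described above might look like this sketch; the draft-model path is a placeholder and the flag names are assumptions to be checked against the server arguments reference:

```bash
# Sketch: serve Llama-3.1-8B-Instruct with EAGLE3 speculative decoding (flags assumed; draft model path is a placeholder)
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --speculative-algorithm EAGLE3 \
  --speculative-draft-model-path <path-to-eagle3-draft-model> \
  --speculative-num-steps 5 \
  --speculative-eagle-topk 8 \
  --speculative-num-draft-tokens 32
```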
2 changes: 1 addition & 1 deletion docs/developer/development_guide_using_docker.md
@@ -52,7 +52,7 @@ docker run -itd --shm-size 32g --gpus all -v <volumes-to-mount> --ipc=host --net
docker exec -it sglang_dev /bin/zsh
```
Some useful volumes to mount are:
-1. **Huggingface model cache**: mounting model cache can avoid re-download everytime docker restarts. Default location on Linux is `~/.cache/huggingface/`.
+1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`.
2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer.

Example 1: Monting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer.
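A concrete variant of the `docker run` line shown in the hunk header, mounting the Hugging Face cache described in item 1, could look like this sketch (image name is a placeholder; the truncated flags are assumed):

```bash
# Sketch: mount the local HF cache so models are not re-downloaded after a container restart
docker run -itd --shm-size 32g --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --ipc=host --network=host --name sglang_dev <sglang-dev-image> /bin/zsh
```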
2 changes: 1 addition & 1 deletion docs/developer/setup_github_runner.md
@@ -29,7 +29,7 @@ Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?a

**Notes**
- Do not need to specify the runner group
-- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be editted later in Github Settings.
+- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings.
- Do not need to change the work folder.

### Step 3: Run the runner by `run.sh`
2 changes: 1 addition & 1 deletion docs/router/router.md
@@ -32,7 +32,7 @@ python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-

After the server is ready, you can directly send requests to the router as the same way as sending requests to each single worker.

-Please adjust the batchsize accordingly to archieve maximum throughput.
+Please adjust the batchsize accordingly to achieve maximum throughput.

```python
import requests
@@ -375,7 +375,7 @@
"\n",
"When opening above experiment, we will see an overview of the experiment as shown below. The upper half shows a summary of the statistics on the left and charts to investigate the distribution and relationships of scores on the right. The lower half is a table with the individual traces which we can use to debug individual samples.\n",
"\n",
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrival step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrieval step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
"\n",
"Note, above link isn't publicly accessible but the experiment can be accessed through [here](https://app.parea.ai/public-experiments/parea/rag_sglang/30f0244a-d56c-44ff-bdfb-8f47626304b6).\n",
"\n",
4 changes: 4 additions & 0 deletions python/pyproject.toml
@@ -147,3 +147,7 @@ exclude = [
"scripts*",
"tests*",
]

+[tool.codespell]
+ignore-words-list = "ans, hel, boostrap"
+skip = "*.json,*.jsonl,*.patch"
2 changes: 1 addition & 1 deletion python/sglang/bench_offline_throughput.py
@@ -315,7 +315,7 @@ def throughput_test(
tokenizer_id = server_args.tokenizer_path or server_args.model_path
tokenizer = get_tokenizer(tokenizer_id)

-# Set global environmnets
+# Set global environments
set_ulimit()
random.seed(bench_args.seed)
np.random.seed(bench_args.seed)
2 changes: 1 addition & 1 deletion python/sglang/bench_serving.py
@@ -1263,7 +1263,7 @@ async def limited_request_func(request_func_input, pbar):
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
print(
"{:<40} {:<10}".format(
"Max reqeuest concurrency:",
"Max request concurrency:",
max_concurrency if max_concurrency else "not set",
)
)
2 changes: 1 addition & 1 deletion python/sglang/compile_deep_gemm.py
@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(


def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
-# Disbale cuda graph and torch compile to save time
+# Disable cuda graph and torch compile to save time
server_args.disable_cuda_graph = True
server_args.enable_torch_compile = False
print(f"Disable CUDA Graph and Torch Compile to save time...")
2 changes: 1 addition & 1 deletion python/sglang/lang/tracer.py
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
with TracingScope(tracer):
tracer.ret_value = program.func(tracer, **arguments)
except (StopTracing, TypeError, AttributeError):
-# Some exceptions may not be catched
+# Some exceptions may not be caught
pass

# Run and cache prefix
7 changes: 3 additions & 4 deletions python/sglang/srt/code_completion_parser.py
@@ -27,7 +27,7 @@


class FimPosition:
"""Postion of fim middle token."""
"""Position of fim middle token."""

MIDDLE = auto()
END = auto()
@@ -145,9 +145,8 @@ def generate_completion_prompt(prompt: str, suffix: str, template_name: str) ->
register_completion_template(
CompletionTemplate(
name="deepseek_coder",
fim_begin_token="<|fim▁begin|>",
fim_middle_token="<|fim▁hole|>",
fim_end_token="<|fim▁end|>",
fim_begin_token="",
fim_middle_token="",
fim_position=FimPosition.MIDDLE,
)
)
4 changes: 2 additions & 2 deletions python/sglang/srt/configs/deepseekvl2.py
@@ -413,9 +413,9 @@ def tokenize_with_images(
h = w = math.ceil(
(self.image_size // self.patch_size) / self.downsample_ratio
)
-# global views tokens h * (w + 1), 1 is for line seperator
+# global views tokens h * (w + 1), 1 is for line separator
tokenized_image = [self.image_token_id] * h * (w + 1)
-# add a seperator between global and local views
+# add a separator between global and local views
tokenized_image += [self.image_token_id]
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += (
2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/decode.py
@@ -509,7 +509,7 @@ def event_loop_normal_disagg_decode(self: Scheduler):
def event_loop_overlap_disagg_decode(self: Scheduler):
result_queue = deque()
self.last_batch: Optional[ScheduleBatch] = None
-self.last_batch_in_queue = False # last batch is modifed in-place, so we need another variable to track if it's extend
+self.last_batch_in_queue = False # last batch is modified in-place, so we need another variable to track if it's extend

while True:
recv_reqs = self.recv_requests()
2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/fake/conn.py
@@ -54,7 +54,7 @@ def send(
logger.info(f"FakeKVSender send success")
else:
self.has_sent = False
logger.info(f"FakeKVSender send fake transfering")
logger.info(f"FakeKVSender send fake transferring")

def failure_exception(self):
raise Exception("Fake KVSender Exception")
2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/mooncake/conn.py
@@ -363,7 +363,7 @@ def add_transfer_request(
self.request_status[bootstrap_room] = KVPoll.WaitingForInput

def check_status(self, bootstrap_room: int):
-# TOOD: do we really need the poll()?
+# TODO: do we really need the poll()?

return self.request_status[bootstrap_room]

2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/utils.py
@@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):


def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
-# 1. The page is guaruanteed to be full except the last page.
+# 1. The page is guaranteed to be full except the last page.
# 2. page index = kv_index // page_size
# The return vector is kv_indices[::page_size] // page_size
if page_size == 1: # shortcut