6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -33,6 +33,12 @@ repos:
rev: 24.10.0
hooks:
- id: black-jupyter
+- repo: https://github.com/codespell-project/codespell
+rev: v2.4.1
+hooks:
+- id: codespell
+additional_dependencies: ['tomli']
+args: ['--toml', 'python/pyproject.toml']
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8
hooks:
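With this hook in place, codespell runs automatically on every commit. A minimal local sketch, assuming `pre-commit` is installed and using the hook id and config path from the hunk above:

```bash
# Install the git hook once per clone (sketch; assumes pre-commit is available)
pip install pre-commit
pre-commit install

# Run only the new codespell hook against the whole repository
pre-commit run codespell --all-files
```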
2 changes: 1 addition & 1 deletion 3rdparty/amd/tuning/TUNING.md
@@ -104,7 +104,7 @@ To maximize moe kernel efficiency, need to use below scripts to find out the bes

```bash
#Tuning
-#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
+#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
#so we can tune decode moe use below command
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
# and use this command to tune prefill moe
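The decode command above passes `--batch "32"`; a prefill run would presumably use the prefill batch computed in the comment (32*1024 = 32768). A sketch reusing the same flags, with only the `--batch` value assumed:

```bash
# Hypothetical prefill MoE tuning run; --batch derived from batch-size * input length (32 * 1024)
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32768"
```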
4 changes: 2 additions & 2 deletions benchmark/hicache/bench_serving.py
@@ -267,7 +267,7 @@ async def get_requests(
try:
request = await asyncio.wait_for(
input_requests_queue.get(), timeout=300
-) # Wait for 5 minites then abort
+) # Wait for 5 minutes then abort
except Exception as e:
print(f"exception: {e}")
break
@@ -514,7 +514,7 @@ async def limited_request_func(request_func_input, queue, tokenizer, pbar):
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
print(
"{:<40} {:<10}".format(
"Max reqeuest concurrency:",
"Max request concurrency:",
max_concurrency if max_concurrency else "not set",
)
)
4 changes: 2 additions & 2 deletions benchmark/json_schema/bench_sglang.py
@@ -95,15 +95,15 @@ def bench_schema(args):
latency = time.time() - tic

# Check if the outputs are valid
-indexs = []
+indexes = []
for i, state in enumerate(states):
try:
schema = json.loads(arguments[i]["json_schema"])
obj = json.loads(state["json_output"])
assert jsonschema.validate(obj, schema) is None
except Exception as e:
print(e)
-indexs.append(i)
+indexes.append(i)

return states, latency

2 changes: 1 addition & 1 deletion benchmark/line_retrieval/gen_data.py
@@ -15,7 +15,7 @@

def generate_lines(random_words, num_lines, redirect_ratio):
prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask."
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resovling the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"

# Raw lines
visited_indices = set([None])
2 changes: 1 addition & 1 deletion benchmark/multi_document_qa/bench_other.py
@@ -17,7 +17,7 @@

def multi_document_qa(docs, question, generate):
s = USER_PREFIX
s += "Pleaes answer a question according to given documents.\n"
s += "Please answer a question according to given documents.\n"
s += "Question:" + question + "Documents begin.\n"

s += "".join(docs)
2 changes: 1 addition & 1 deletion benchmark/multi_document_qa/bench_sglang.py
@@ -13,7 +13,7 @@
@sgl.function
def multi_document_qa(s, docs, question):
s += sgl.user_begin()
s += "Pleaes answer a question according to given documents.\n"
s += "Please answer a question according to given documents.\n"
s += "Question:" + question + "Documents begin.\n"

forks = s.fork(len(docs))
4 changes: 2 additions & 2 deletions docs/backend/function_calling.ipynb
@@ -6,7 +6,7 @@
"source": [
"# Tool and Function Calling\n",
"\n",
"This guide demonstrates how to use SGLang’s [Funcion calling](https://platform.openai.com/docs/guides/function-calling) functionality."
"This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
]
},
{
@@ -399,7 +399,7 @@
" },\n",
"}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print_highlight(\"==== Reponse ====\")\n",
"print_highlight(\"==== Response ====\")\n",
"print(gen_response)\n",
"\n",
"# parse the response\n",
2 changes: 1 addition & 1 deletion docs/backend/openai_api_completions.ipynb
@@ -275,7 +275,7 @@
"source": [
"## Structured Outputs (JSON, Regex, EBNF)\n",
"\n",
"For OpenAI compatible structed outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
"For OpenAI compatible structured outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
]
},
{
2 changes: 1 addition & 1 deletion docs/backend/sampling_params.md
@@ -40,7 +40,7 @@ The `/generate` endpoint accepts the following parameters in JSON format. For de
| Argument | Type/Default | Description |
|--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
| frequency_penalty | `float = 0.0` | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
-| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occured. |
+| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
| min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |

### Constrained decoding
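As an illustration of the penalty parameters in the table above, a minimal native `/generate` request might look like the following sketch (default local port and payload shape assumed; values are arbitrary):

```bash
# Sketch: set both penalties on a /generate call against a locally running server
curl -s http://localhost:30000/generate \
  -H "Content-Type: application/json" \
  -d '{
        "text": "List three fruits:",
        "sampling_params": {
          "max_new_tokens": 64,
          "frequency_penalty": 0.5,
          "presence_penalty": 0.3,
          "min_new_tokens": 1
        }
      }'
```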
2 changes: 1 addition & 1 deletion docs/backend/send_request.ipynb
@@ -166,7 +166,7 @@
"source": [
"## Using Native Generation APIs\n",
"\n",
"You can also use the native `/generate` endpoint with requests, which provides more flexiblity. An API reference is available at [Sampling Parameters](sampling_params.md)."
"You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
]
},
{
2 changes: 1 addition & 1 deletion docs/backend/separate_reasoning.ipynb
@@ -378,7 +378,7 @@
"\n",
" Args:\n",
" model_type (str): Type of model to parse reasoning from\n",
" stream_reasoning (bool): If Flase, accumulates reasoning content until complete.\n",
" stream_reasoning (bool): If False, accumulates reasoning content until complete.\n",
" If True, streams reasoning content as it arrives.\n",
" \"\"\"\n",
"\n",
4 changes: 2 additions & 2 deletions docs/backend/speculative_decoding.ipynb
@@ -11,7 +11,7 @@
"\n",
"### Performance Highlights\n",
"\n",
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be archieved via EAGLE3 decoding.\n",
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n",
"For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n",
"\n",
"| Method | Throughput (tokens/s) |\n",
@@ -296,7 +296,7 @@
"- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n",
"- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n",
"\n",
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionaly to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
"\n",
"\n",
"For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)."
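For reference, an EAGLE3 launch along the lines described above might look like this sketch; the draft-model path is a placeholder and the flag names are assumptions to be checked against the server arguments reference:

```bash
# Sketch: serve Llama-3.1-8B-Instruct with EAGLE3 speculative decoding (flags assumed; draft model path is a placeholder)
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --speculative-algorithm EAGLE3 \
  --speculative-draft-model-path <path-to-eagle3-draft-model> \
  --speculative-num-steps 5 \
  --speculative-eagle-topk 8 \
  --speculative-num-draft-tokens 32
```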
2 changes: 1 addition & 1 deletion docs/developer/development_guide_using_docker.md
@@ -52,7 +52,7 @@ docker run -itd --shm-size 32g --gpus all -v <volumes-to-mount> --ipc=host --net
docker exec -it sglang_dev /bin/zsh
```
Some useful volumes to mount are:
-1. **Huggingface model cache**: mounting model cache can avoid re-download everytime docker restarts. Default location on Linux is `~/.cache/huggingface/`.
+1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`.
2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer.

Example 1: Monting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer.
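A concrete variant of the `docker run` line shown in the hunk header, mounting the Hugging Face cache described in item 1, could look like this sketch (image name is a placeholder; the truncated flags are assumed):

```bash
# Sketch: mount the local HF cache so models are not re-downloaded after a container restart
docker run -itd --shm-size 32g --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --ipc=host --network=host --name sglang_dev <sglang-dev-image> /bin/zsh
```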
2 changes: 1 addition & 1 deletion docs/developer/setup_github_runner.md
@@ -29,7 +29,7 @@ Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?a

**Notes**
- Do not need to specify the runner group
-- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be editted later in Github Settings.
+- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings.
- Do not need to change the work folder.

### Step 3: Run the runner by `run.sh`
2 changes: 1 addition & 1 deletion docs/router/router.md
@@ -32,7 +32,7 @@ python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-

After the server is ready, you can directly send requests to the router as the same way as sending requests to each single worker.

-Please adjust the batchsize accordingly to archieve maximum throughput.
+Please adjust the batchsize accordingly to achieve maximum throughput.

```python
import requests
@@ -375,7 +375,7 @@
"\n",
"When opening above experiment, we will see an overview of the experiment as shown below. The upper half shows a summary of the statistics on the left and charts to investigate the distribution and relationships of scores on the right. The lower half is a table with the individual traces which we can use to debug individual samples.\n",
"\n",
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrival step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrieval step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
"\n",
"Note, above link isn't publicly accessible but the experiment can be accessed through [here](https://app.parea.ai/public-experiments/parea/rag_sglang/30f0244a-d56c-44ff-bdfb-8f47626304b6).\n",
"\n",
4 changes: 4 additions & 0 deletions python/pyproject.toml
@@ -147,3 +147,7 @@ exclude = [
"scripts*",
"tests*",
]

+[tool.codespell]
+ignore-words-list = "ans, hel, boostrap"
+skip = "*.json,*.jsonl,*.patch"
2 changes: 1 addition & 1 deletion python/sglang/bench_offline_throughput.py
@@ -315,7 +315,7 @@ def throughput_test(
tokenizer_id = server_args.tokenizer_path or server_args.model_path
tokenizer = get_tokenizer(tokenizer_id)

-# Set global environmnets
+# Set global environments
set_ulimit()
random.seed(bench_args.seed)
np.random.seed(bench_args.seed)
2 changes: 1 addition & 1 deletion python/sglang/bench_serving.py
@@ -1263,7 +1263,7 @@ async def limited_request_func(request_func_input, pbar):
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
print(
"{:<40} {:<10}".format(
"Max reqeuest concurrency:",
"Max request concurrency:",
max_concurrency if max_concurrency else "not set",
)
)
2 changes: 1 addition & 1 deletion python/sglang/compile_deep_gemm.py
@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(


def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
-# Disbale cuda graph and torch compile to save time
+# Disable cuda graph and torch compile to save time
server_args.disable_cuda_graph = True
server_args.enable_torch_compile = False
print(f"Disable CUDA Graph and Torch Compile to save time...")
2 changes: 1 addition & 1 deletion python/sglang/lang/tracer.py
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
with TracingScope(tracer):
tracer.ret_value = program.func(tracer, **arguments)
except (StopTracing, TypeError, AttributeError):
-# Some exceptions may not be catched
+# Some exceptions may not be caught
pass

# Run and cache prefix
7 changes: 3 additions & 4 deletions python/sglang/srt/code_completion_parser.py
@@ -27,7 +27,7 @@


class FimPosition:
"""Postion of fim middle token."""
"""Position of fim middle token."""

MIDDLE = auto()
END = auto()
@@ -145,9 +145,8 @@ def generate_completion_prompt(prompt: str, suffix: str, template_name: str) ->
register_completion_template(
CompletionTemplate(
name="deepseek_coder",
fim_begin_token="<|fim▁begin|>",
fim_middle_token="<|fim▁hole|>",
fim_end_token="<|fim▁end|>",
fim_begin_token="",
fim_middle_token="",
fim_position=FimPosition.MIDDLE,
)
)
4 changes: 2 additions & 2 deletions python/sglang/srt/configs/deepseekvl2.py
@@ -413,9 +413,9 @@ def tokenize_with_images(
h = w = math.ceil(
(self.image_size // self.patch_size) / self.downsample_ratio
)
-# global views tokens h * (w + 1), 1 is for line seperator
+# global views tokens h * (w + 1), 1 is for line separator
tokenized_image = [self.image_token_id] * h * (w + 1)
-# add a seperator between global and local views
+# add a separator between global and local views
tokenized_image += [self.image_token_id]
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += (
2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/decode.py
@@ -509,7 +509,7 @@ def event_loop_normal_disagg_decode(self: Scheduler):
def event_loop_overlap_disagg_decode(self: Scheduler):
result_queue = deque()
self.last_batch: Optional[ScheduleBatch] = None
-self.last_batch_in_queue = False # last batch is modifed in-place, so we need another variable to track if it's extend
+self.last_batch_in_queue = False # last batch is modified in-place, so we need another variable to track if it's extend

while True:
recv_reqs = self.recv_requests()
2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/fake/conn.py
@@ -54,7 +54,7 @@ def send(
logger.info(f"FakeKVSender send success")
else:
self.has_sent = False
logger.info(f"FakeKVSender send fake transfering")
logger.info(f"FakeKVSender send fake transferring")

def failure_exception(self):
raise Exception("Fake KVSender Exception")
2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/mooncake/conn.py
@@ -363,7 +363,7 @@ def add_transfer_request(
self.request_status[bootstrap_room] = KVPoll.WaitingForInput

def check_status(self, bootstrap_room: int):
-# TOOD: do we really need the poll()?
+# TODO: do we really need the poll()?

return self.request_status[bootstrap_room]

2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/utils.py
@@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):


def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
-# 1. The page is guaruanteed to be full except the last page.
+# 1. The page is guaranteed to be full except the last page.
# 2. page index = kv_index // page_size
# The return vector is kv_indices[::page_size] // page_size
if page_size == 1: # shortcut