Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,20 +115,24 @@ You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, and parallelism for you.

### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.

```python
@sgl.function
def control_flow(s, question):
s += "To answer this question: " + question + ", "
s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "

# You can use if or nested function calls
if s["tool"] == "calculator":
s += "The math expression is" + sgl.gen("expression")
elif s["tool"] == "web browser":
s += "The website url is" + sgl.gen("url")
```

### Parallelism
Use `fork` to launch parallel prompts.
Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.

```python
@sgl.function
def tip_suggestion(s):
Expand All @@ -137,7 +141,7 @@ def tip_suggestion(s):
"1. Balanced Diet. 2. Regular Exercise.\n\n"
)

forks = s.fork(2) # Launch parallel prompts
forks = s.fork(2)
for i, f in enumerate(forks):
f += f"Now, expand tip {i+1} into a paragraph:\n"
f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
Expand All @@ -148,6 +152,8 @@ def tip_suggestion(s):
```

### Multi Modality
Use `sgl.image` to pass an image as input.

```python
@sgl.function
def image_qa(s, image_file, question):
Expand All @@ -156,6 +162,8 @@ def image_qa(s, image_file, question):
```

### Constrained Decoding
Use `regex=` to specify a regular expression as a decoding constraint.

```python
@sgl.function
def regular_expression_gen(s):
Expand All @@ -168,6 +176,8 @@ def regular_expression_gen(s):
```

### Batching
Use `run_batch` to run a batch of requests with continuous batching.

```python
@sgl.function
def text_qa(s, question):
Expand All @@ -180,10 +190,13 @@ states = text_qa.run_batch(
{"question": "What is the capital of France?"},
{"question": "What is the capital of Japan?"},
],
progress_bar=True
)
```

### Streaming
Add `stream=True` to enable streaming.

```python
@sgl.function
def text_qa(s, question):
Expand All @@ -192,7 +205,9 @@ def text_qa(s, question):

state = text_qa.run(
question="What is the capital of France?",
temperature=0.1)
temperature=0.1,
stream=True
)

for out in state.text_iter():
print(out, end="", flush=True)
Expand Down
5 changes: 4 additions & 1 deletion examples/usage/readme_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def driver_batching():
{"question": "What is the capital of France?"},
{"question": "What is the capital of Japan?"},
],
progress_bar=True
)

for s in states:
Expand All @@ -63,7 +64,9 @@ def driver_batching():
def driver_stream():
state = text_qa.run(
question="What is the capital of France?",
temperature=0.1)
temperature=0.1,
stream=True
)

for out in state.text_iter():
print(out, end="", flush=True)
Expand Down
6 changes: 1 addition & 5 deletions python/sglang/lang/interpreter.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,11 +632,7 @@ def __del__(self):
self.stream_executor.end()

def __repr__(self) -> str:
msgs = self.messages()
ret = ""
for msg in msgs:
ret += msg["role"] + ":\n" + msg["content"] + "\n"
return ret
return f"ProgramState({self.text()})"


class ProgramStateGroup:
Expand Down
1 change: 0 additions & 1 deletion python/sglang/srt/layers/context_flashattention_nopad.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import triton.language as tl
from sglang.srt.utils import wrap_kernel_launcher


CUDA_CAPABILITY = torch.cuda.get_device_capability()


Expand Down
1 change: 0 additions & 1 deletion python/sglang/srt/layers/extend_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
from sglang.srt.utils import wrap_kernel_launcher


CUDA_CAPABILITY = torch.cuda.get_device_capability()


Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/managers/router/model_rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import logging
import multiprocessing
import time
import warnings
from concurrent.futures import ThreadPoolExecutor
from enum import Enum, auto
from typing import Dict, List, Optional, Tuple, Union
import warnings

import numpy as np
import rpyc
Expand Down