Skip to content

Commit 96e733c

Browse files
Merge branch 'main' into xiezhq-hicache-upstream
2 parents 4b816da + f9eb04d commit 96e733c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+2254
-510
lines changed

docker/Dockerfile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
5858
&& python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
5959
&& if [ "$CUDA_VERSION" = "12.8.1" ]; then \
6060
python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps ; \
61-
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.0/sgl_kernel-0.2.0+cu128-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
61+
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.1/sgl_kernel-0.2.1+cu128-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
6262
fi
6363

6464
# Build and install NVSHMEM + DeepEP
@@ -143,7 +143,10 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
143143
icdiff \
144144
scikit_build_core \
145145
uv \
146-
pre-commit
146+
pre-commit \
147+
pandas \
148+
matplotlib \
149+
tabulate
147150

148151
# Install diff-so-fancy
149152
RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \

docs/references/advanced_deploy.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ Multi-Node Deployment
55

66
multi_node.md
77
deploy_on_k8s.md
8+
disaggregation/lws_pd_deploy.md
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: deepseekr10528-decode-main
5+
spec:
6+
selector:
7+
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
8+
role: leader
9+
ports:
10+
- protocol: TCP
11+
port: 30000
12+
targetPort: 30000
Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,292 @@
1+
apiVersion: leaderworkerset.x-k8s.io/v1
2+
kind: LeaderWorkerSet
3+
metadata:
4+
name: deepseekr10528-decode-main
5+
spec:
6+
leaderWorkerTemplate:
7+
leaderTemplate:
8+
metadata:
9+
labels:
10+
role: leader
11+
spec:
12+
containers:
13+
- command:
14+
- python3
15+
- -m
16+
- sglang.launch_server
17+
- --port
18+
- "30000"
19+
- --host
20+
- "0.0.0.0"
21+
- --model-path
22+
- /work/models
23+
- --chunked-prefill-size
24+
- "262144"
25+
- --page-size
26+
- "64"
27+
- --enable-dp-attention
28+
- --enable-dp-lm-head
29+
- --dp-size
30+
- "16"
31+
- --enable-deepep-moe
32+
- --deepep-mode
33+
- low_latency
34+
- --disaggregation-mode
35+
- decode
36+
- --mem-fraction-static
37+
- "0.849"
38+
- --context-length
39+
- "32768"
40+
- --disaggregation-ib-device
41+
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
42+
- --cuda-graph-max-bs
43+
- "64"
44+
- --max-running-requests
45+
- "2048"
46+
- --tp-size
47+
- "16" # Size of Tensor Parallelism
48+
- --dist-init-addr
49+
- $(LWS_LEADER_ADDRESS):20102
50+
- --nnodes
51+
- $(LWS_GROUP_SIZE)
52+
- --node-rank
53+
- $(LWS_WORKER_INDEX)
54+
- --trust-remote-code
55+
- --ep-num-redundant-experts
56+
- "32"
57+
- --moe-dense-tp-size
58+
- "1"
59+
env:
60+
- name: CUDA_LAUNCH_BLOCKING
61+
value: "0"
62+
- name: NVSHMEM_IB_GID_INDEX
63+
value: "3"
64+
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
65+
value: "1"
66+
- name: NVSHMEM_HCA_PE_MAPPING
67+
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
68+
- name: NCCL_IB_QPS_PER_CONNECTION
69+
value: "8"
70+
- name: NCCL_IB_SPLIT_DATA_ON_QPS
71+
value: "1"
72+
- name: NCCL_NET_PLUGIN
73+
value: "none"
74+
- name: NCCL_IB_TC
75+
value: "136"
76+
- name: NCCL_MIN_NCHANNELS
77+
value: "4"
78+
- name: NCCL_IB_SL
79+
value: "5"
80+
- name: MC_TE_METRIC
81+
value: "true"
82+
- name: SGLANG_MOONCAKE_TRANS_THREAD
83+
value: "16"
84+
- name: SGL_ENABLE_JIT_DEEPGEMM
85+
value: "1"
86+
- name: NCCL_IB_HCA
87+
value: ^=mlx5_0,mlx5_5,mlx5_6
88+
- name: LWS_WORKER_INDEX
89+
valueFrom:
90+
fieldRef:
91+
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
92+
image: lmsysorg/sglang:latest
93+
name: sglang-leader
94+
ports:
95+
- containerPort: 30000
96+
protocol: TCP
97+
readinessProbe:
98+
periodSeconds: 30
99+
tcpSocket:
100+
port: 30000
101+
resources:
102+
limits:
103+
nvidia.com/gpu: "8"
104+
securityContext:
105+
capabilities:
106+
add:
107+
- IPC_LOCK
108+
privileged: true
109+
volumeMounts:
110+
- mountPath: /root/.cache
111+
name: sgl-cache
112+
- mountPath: /dev/shm
113+
name: dshm
114+
- mountPath: /work/models
115+
name: model
116+
- mountPath: /dev/infiniband
117+
name: ib
118+
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
119+
name: cf
120+
dnsPolicy: ClusterFirstWithHostNet
121+
hostIPC: true
122+
hostNetwork: true
123+
nodeSelector:
124+
# modify according to your deployment environment
125+
pd: "yes"
126+
tolerations:
127+
# modify according to your deployment environment
128+
- key: bopd
129+
operator: Exists
130+
- key: node-role
131+
operator: Exists
132+
volumes:
133+
- hostPath:
134+
path: /data1/sgl_cache1
135+
type: DirectoryOrCreate
136+
name: sgl-cache
137+
- emptyDir:
138+
medium: Memory
139+
name: dshm
140+
- hostPath:
141+
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
142+
name: model
143+
- hostPath:
144+
path: /dev/infiniband
145+
name: ib
146+
- hostPath:
147+
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
148+
name: cf
149+
restartPolicy: RecreateGroupOnPodRestart
150+
size: 2
151+
workerTemplate:
152+
metadata: {}
153+
spec:
154+
containers:
155+
- command:
156+
- python3
157+
- -m
158+
- sglang.launch_server
159+
- --model-path
160+
- /work/models
161+
- --chunked-prefill-size
162+
- "262144"
163+
- --page-size
164+
- "64"
165+
- --enable-dp-attention
166+
- --enable-dp-lm-head
167+
- --dp-size
168+
- "16"
169+
- --enable-deepep-moe
170+
- --deepep-mode
171+
- low_latency
172+
- --disaggregation-mode
173+
- decode
174+
- --mem-fraction-static
175+
- "0.849"
176+
- --context-length
177+
- "32768"
178+
- --disaggregation-ib-device
179+
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
180+
- --cuda-graph-max-bs
181+
- "64"
182+
- --max-running-requests
183+
- "2048"
184+
- --tp-size
185+
- "16" # Size of Tensor Parallelism
186+
- --dist-init-addr
187+
- $(LWS_LEADER_ADDRESS):20102
188+
- --nnodes
189+
- $(LWS_GROUP_SIZE)
190+
- --node-rank
191+
- $(LWS_WORKER_INDEX)
192+
- --trust-remote-code
193+
- --ep-num-redundant-experts
194+
- "32"
195+
- --moe-dense-tp-size
196+
- "1"
197+
env:
198+
- name: NVSHMEM_IB_TRAFFIC_CLASS
199+
value: "16"
200+
- name: NVSHMEM_IB_GID_INDEX
201+
value: "3"
202+
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
203+
value: "1"
204+
- name: NVSHMEM_HCA_PE_MAPPING
205+
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
206+
- name: NCCL_IB_QPS_PER_CONNECTION
207+
value: "8"
208+
- name: NCCL_IB_SPLIT_DATA_ON_QPS
209+
value: "1"
210+
- name: NCCL_NET_PLUGIN
211+
value: "none"
212+
- name: NCCL_IB_TC
213+
value: "136"
214+
- name: NCCL_MIN_NCHANNELS
215+
value: "4"
216+
- name: MC_TE_METRIC
217+
value: "true"
218+
- name: NCCL_IB_SL
219+
value: "5"
220+
- name: SGLANG_MOONCAKE_TRANS_THREAD
221+
value: "16"
222+
- name: SGL_ENABLE_JIT_DEEPGEMM
223+
value: "1"
224+
- name: NCCL_IB_HCA
225+
value: ^=mlx5_0,mlx5_5,mlx5_6
226+
- name: LWS_WORKER_INDEX
227+
valueFrom:
228+
fieldRef:
229+
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
230+
image: lmsysorg/sglang:latest
231+
name: sglang-worker
232+
ports:
233+
- containerPort: 30001
234+
resources:
235+
limits:
236+
nvidia.com/gpu: "8"
237+
securityContext:
238+
capabilities:
239+
add:
240+
- IPC_LOCK
241+
privileged: true
242+
volumeMounts:
243+
- mountPath: /root/.cache
244+
name: sgl-cache
245+
- mountPath: /dev/shm
246+
name: dshm
247+
- mountPath: /work/models
248+
name: model
249+
- mountPath: /dev/infiniband
250+
name: ib
251+
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
252+
name: cf
253+
dnsPolicy: ClusterFirstWithHostNet
254+
hostIPC: true
255+
hostNetwork: true
256+
nodeSelector:
257+
# modify according to your deployment environment
258+
pd: "yes"
259+
tolerations:
260+
# modify according to your deployment environment
261+
- key: bopd
262+
operator: Exists
263+
- key: node-role
264+
operator: Exists
265+
volumes:
266+
- hostPath:
267+
path: /data1/sgl_cache1
268+
type: DirectoryOrCreate
269+
name: sgl-cache
270+
- emptyDir:
271+
medium: Memory
272+
name: dshm
273+
- hostPath:
274+
path: /dev/infiniband
275+
name: ib
276+
- hostPath:
277+
# modify according to your deployment environment
278+
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
279+
name: model
280+
- hostPath:
281+
# modify according to your deployment environment
282+
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
283+
name: cf
284+
networkConfig:
285+
subdomainPolicy: Shared
286+
replicas: 1
287+
rolloutStrategy:
288+
rollingUpdateConfiguration:
289+
maxSurge: 0
290+
maxUnavailable: 1
291+
type: RollingUpdate
292+
startupPolicy: LeaderCreated
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: deepseekr10528-lb-main
5+
labels:
6+
app: deepseekr10528-lb
7+
spec:
8+
replicas: 1
9+
selector:
10+
matchLabels:
11+
app: deepseekr10528-lb
12+
template:
13+
metadata:
14+
labels:
15+
app: deepseekr10528-lb
16+
spec:
17+
nodeSelector:
18+
bo: "yes"
19+
tolerations:
20+
- key: bopd
21+
operator: Exists
22+
- key: node-role
23+
operator: Exists
24+
containers:
25+
- name: sgl-minilb
26+
image: lmsysorg/sglang:latest
27+
command:
28+
- python
29+
- -m
30+
- sglang.srt.disaggregation.mini_lb
31+
- --prefill
32+
- http://deepseekr10528-prefill-main:30000
33+
- --decode
34+
- http://deepseekr10528-decode-main:30000
35+
- --host
36+
- 0.0.0.0
37+
- --port
38+
- "8000"
39+
ports:
40+
- containerPort: 8000
41+
42+
---
43+
apiVersion: v1
44+
kind: Service
45+
metadata:
46+
name: deepseekr10528-lb-service
47+
spec:
48+
type: NodePort # NodePort is easy to test, you can also specify `ClusterIP`
49+
selector:
50+
app: deepseekr10528-lb
51+
ports:
52+
- protocol: TCP
53+
port: 8000 # Service Port(In-Cluster)
54+
targetPort: 8000 # Exposed Container
55+
nodePort: 30800

0 commit comments

Comments
 (0)