sgl-project
diff --git a/‎docker/Dockerfile‎
Lines changed: 5 additions & 2 deletions b/‎docker/Dockerfile‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎docs/references/advanced_deploy.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/references/advanced_deploy.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/references/disaggregation/lws-examples/d-svc.yaml‎
Lines changed: 12 additions & 0 deletions b/‎docs/references/disaggregation/lws-examples/d-svc.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docs/references/disaggregation/lws-examples/d.yaml‎
Lines changed: 292 additions & 0 deletions b/‎docs/references/disaggregation/lws-examples/d.yaml‎
Lines changed: 292 additions & 0 deletions
diff --git a/‎docs/references/disaggregation/lws-examples/lb.yaml‎
Lines changed: 55 additions & 0 deletions b/‎docs/references/disaggregation/lws-examples/lb.yaml‎
Lines changed: 55 additions & 0 deletions
@@ -58,7 +58,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
  && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
  && if [ "$CUDA_VERSION" = "12.8.1" ]; then \
       python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps ; \
-      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.0/sgl_kernel-0.2.0+cu128-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
+      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.1/sgl_kernel-0.2.1+cu128-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
     fi
 
 # Build and install NVSHMEM + DeepEP
@@ -143,7 +143,10 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
     icdiff \
     scikit_build_core \
     uv \
-    pre-commit
+    pre-commit \
+    pandas \
+    matplotlib \
+    tabulate
 
 # Install diff-so-fancy
 RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
 
@@ -5,3 +5,4 @@ Multi-Node Deployment
 
    multi_node.md
    deploy_on_k8s.md
+   disaggregation/lws_pd_deploy.md
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service  # in-cluster endpoint for the leader pod of the decode LeaderWorkerSet
+metadata:
+  name: deepseekr10528-decode-main
+spec:
+  selector:  # a pod must carry both labels below to receive traffic
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
+    role: leader  # set on the leaderTemplate in d.yaml; worker pods do not carry it
+  ports:
+    - protocol: TCP
+      port: 30000
+      targetPort: 30000  # the port the leader passes to sglang via --port
@@ -0,0 +1,292 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet  # decode-side pod group: every container below runs --disaggregation-mode decode
+metadata:
+  name: deepseekr10528-decode-main
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader  # only the leader template carries this label (workerTemplate metadata is empty)
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --port
+          - "30000"  # same port as containerPort and the readinessProbe below
+          - --host
+          - "0.0.0.0"
+          - --model-path
+          - /work/models  # mount point of the "model" volume
+          - --chunked-prefill-size
+          - "262144"
+          - --page-size
+          - "64"
+          - --enable-dp-attention
+          - --enable-dp-lm-head
+          - --dp-size
+          - "16"
+          - --enable-deepep-moe
+          - --deepep-mode
+          - low_latency
+          - --disaggregation-mode
+          - decode
+          - --mem-fraction-static
+          - "0.849"
+          - --context-length
+          - "32768"
+          - --disaggregation-ib-device
+          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+          - --cuda-graph-max-bs
+          - "64"
+          - --max-running-requests
+          - "2048"
+          - --tp-size
+          - "16"  # Size of Tensor Parallelism
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102  # not set in env below; presumably injected by the LWS controller
+          - --nnodes
+          - $(LWS_GROUP_SIZE)  # not set in env below; presumably injected by the LWS controller
+          - --node-rank
+          - $(LWS_WORKER_INDEX)  # defined in env below from the pod's worker-index label
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          env:
+          - name: CUDA_LAUNCH_BLOCKING  # NOTE(review): set on the leader only; confirm the worker should omit it
+            value: "0"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA  # NCCL syntax: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX  # copied from the pod label so the command can expand $(LWS_WORKER_INDEX)
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:latest  # NOTE(review): floating tag; consider pinning a release for reproducibility
+          name: sglang-leader
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:  # pod is marked ready once a TCP connection to the server port succeeds
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+          # modify according to your deployment environment
+          pd: "yes"
+        tolerations:
+        # modify according to your deployment environment
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - hostPath:  # modify the path below according to your deployment environment
+            path: /data1/sgl_cache1
+            type: DirectoryOrCreate
+          name: sgl-cache
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:  # modify the path below according to your deployment environment
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:  # modify the path below according to your deployment environment
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+    restartPolicy: RecreateGroupOnPodRestart
+    size: 2  # pods per group, leader included; 2 pods x 8 GPUs each matches --tp-size 16
+    workerTemplate:
+      metadata: {}
+      spec:
+        containers:
+        - command:  # same launch flags as the leader, minus --port and --host
+          - python3
+          - -m
+          - sglang.launch_server
+          - --model-path
+          - /work/models  # mount point of the "model" volume
+          - --chunked-prefill-size
+          - "262144"
+          - --page-size
+          - "64"
+          - --enable-dp-attention
+          - --enable-dp-lm-head
+          - --dp-size
+          - "16"
+          - --enable-deepep-moe
+          - --deepep-mode
+          - low_latency
+          - --disaggregation-mode
+          - decode
+          - --mem-fraction-static
+          - "0.849"
+          - --context-length
+          - "32768"
+          - --disaggregation-ib-device
+          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+          - --cuda-graph-max-bs
+          - "64"
+          - --max-running-requests
+          - "2048"
+          - --tp-size
+          - "16"  # Size of Tensor Parallelism
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102  # not set in env below; presumably injected by the LWS controller
+          - --nnodes
+          - $(LWS_GROUP_SIZE)  # not set in env below; presumably injected by the LWS controller
+          - --node-rank
+          - $(LWS_WORKER_INDEX)  # defined in env below from the pod's worker-index label
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          env:
+          - name: NVSHMEM_IB_TRAFFIC_CLASS  # NOTE(review): set on the worker only; confirm the leader should omit it
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA  # NCCL syntax: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX  # copied from the pod label so the command can expand $(LWS_WORKER_INDEX)
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:latest  # NOTE(review): floating tag; consider pinning a release for reproducibility
+          name: sglang-worker
+          ports:
+          - containerPort: 30001
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+          # modify according to your deployment environment
+          pd: "yes"
+        tolerations:
+        # modify according to your deployment environment
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - hostPath:  # modify the path below according to your deployment environment
+            path: /data1/sgl_cache1
+            type: DirectoryOrCreate
+          name: sgl-cache
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            # modify according to your deployment environment
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            # modify according to your deployment environment
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+  networkConfig:
+    subdomainPolicy: Shared
+  replicas: 1  # number of leader+worker groups
+  rolloutStrategy:
+    rollingUpdateConfiguration:
+      maxSurge: 0
+      maxUnavailable: 1
+    type: RollingUpdate
+  startupPolicy: LeaderCreated
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment  # runs one replica of the sglang mini_lb module given --prefill and --decode URLs
+metadata:
+  name: deepseekr10528-lb-main
+  labels:
+    app: deepseekr10528-lb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseekr10528-lb
+  template:
+    metadata:
+      labels:
+        app: deepseekr10528-lb  # must match spec.selector.matchLabels above
+    spec:
+      nodeSelector:  # modify according to your deployment environment
+        bo: "yes"
+      tolerations:  # modify according to your deployment environment
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+      containers:
+        - name: sgl-minilb
+          image: lmsysorg/sglang:latest  # NOTE(review): floating tag; consider pinning a release
+          command:
+            - python
+            - -m
+            - sglang.srt.disaggregation.mini_lb
+            - --prefill
+            - http://deepseekr10528-prefill-main:30000  # presumably the prefill Service (not defined in this file)
+            - --decode
+            - http://deepseekr10528-decode-main:30000  # name and port of the decode Service in d-svc.yaml
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"  # same port as containerPort below
+          ports:
+            - containerPort: 8000
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-lb-service
+spec:
+  type: NodePort # NodePort is convenient for testing; `ClusterIP` also works for in-cluster-only access
+  selector:
+    app: deepseekr10528-lb
+  ports:
+    - protocol: TCP
+      port: 8000         # Service port (in-cluster)
+      targetPort: 8000   # container port that traffic is forwarded to
+      nodePort: 30800  # port opened on every node for access from outside the cluster
Original file line number	Diff line number	Diff line change
`@@ -5,3 +5,4 @@ Multi-Node Deployment`
`5`	`5`
`6`	`6`	`multi_node.md`
`7`	`7`	`deploy_on_k8s.md`
	`8`	`+ disaggregation/lws_pd_deploy.md`