volcengine
diff --git a/‎.github/workflows/e2e_sglang_gsm8k.yml
+60 b/‎.github/workflows/e2e_sglang_gsm8k.yml
+60
diff --git a/‎.gitignore
+3-1 b/‎.gitignore
+3-1
diff --git a/‎docs/start/install.rst
+21-1 b/‎docs/start/install.rst
+21-1
diff --git a/‎pyproject.toml
+1 b/‎pyproject.toml
+1
diff --git a/‎requirements.txt
+1-1 b/‎requirements.txt
+1-1
diff --git a/‎tests/e2e/run_qwen_gsm8k_function_rm.sh
+4-4 b/‎tests/e2e/run_qwen_gsm8k_function_rm.sh
+4-4
diff --git a/‎tests/rollout/test_sglang_spmd.py
+210 b/‎tests/rollout/test_sglang_spmd.py
+210
diff --git a/‎verl/single_controller/ray/base.py
+1 b/‎verl/single_controller/ray/base.py
+1
@@ -0,0 +1,60 @@
+name: e2e_sglang_gsm8k
+
+on:
+  # Trigger the workflow on pushes and pull requests that target the
+  # main or v0.2.x branch and touch one of the listed paths.
+  push:
+    branches:
+      - main
+      - v0.2.x
+    paths:
+      - "**/*.py"
+      - .github/workflows/e2e_sglang_gsm8k.yml
+  pull_request:
+    branches:
+      - main
+      - v0.2.x
+    paths:
+      - "**/*.py"
+      - "verl/trainer/config/*.yaml"
+      - .github/workflows/e2e_sglang_gsm8k.yml
+      - "tests/e2e/*.sh"
+
+# Cancel in-progress runs on the same ref when a new one is triggered,
+# except on the main branch.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+# Grant the workflow read-only access to repository contents.
+permissions:
+  contents: read
+
+jobs:
+  e2e_sglang_gsm8k:
+    runs-on: [self-hosted, l20-1]
+    timeout-minutes: 40  # Increase this timeout value as needed
+    env:
+      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
+      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
+      NO_PROXY: "localhost,127.0.0.1"
+      # Environment values are strings; quote so the type is explicit.
+      HF_HUB_ENABLE_HF_TRANSFER: "1"
+    container:
+      image: ocss884/verl-sglang:ngc-th2.5.1-cu126-sglang0.4.3.post3
+      options: --gpus all --shm-size=10g
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Install the current repository
+        # NOTE(review): --no-deps skips dependency installation, so the
+        # extras are presumably expected to be preinstalled in the container
+        # image - confirm.
+        run: |
+          pip3 install hf_transfer
+          pip3 install -e .[test,gpu,sglang] --no-deps
+      - name: Prepare gsm8k dataset
+        run: |
+          ray stop --force
+          python3 examples/data_preprocess/gsm8k.py
+      - name: Running gsm8k e2e training tests on 8 L20 GPUs with rmpad using function rm and save ckpt
+        run: |
+          ray stop --force
+          bash tests/e2e/run_qwen_gsm8k_function_rm.sh sglang
+
@@ -93,6 +93,7 @@ celerybeat-schedule
 
 # virtualenv
 venv/
+.venv/
 ENV/
 
 # Spyder project settings
@@ -122,4 +123,5 @@ tests/e2e/toy_examples/deepspeed/synchronous/output.txt
 
 # local logs
 logs
-log
+log
+outputs
@@ -10,7 +10,7 @@ Requirements
 verl supports various backends. Currently, the following configurations are available:
 
 - **FSDP** and **Megatron-LM** (optional) for training.
-- **vLLM** and **TGI** for rollout generation, **SGLang** support coming soon.
+- **SGLang**, **vLLM** and **TGI** for rollout generation.
 
 Training backends
 ------------------
@@ -19,6 +19,25 @@ We recommend using **FSDP** backend to investigate, research and prototype diffe
 
 For users who pursue better scalability, we recommend using **Megatron-LM** backend. Currently, we support Megatron-LM v0.4 [1]_. The guide for using Megatron-LM backend can be found in :doc:`Megatron-LM Workers<../workers/megatron_workers>`.
 
+Install verl-SGLang from scratch
+-------------------------------------
+
+**SGLang largely supports the research and inference workloads at xAI. When installing verl-sglang, ignore the version conflicts with vllm that pip reports. SGLang also provides a native API for RLHF, so you do not need to patch a single line of code.**
+
+The following steps are a quick installation guide for verl-SGLang.
+
+.. code:: bash
+
+    # Create a virtual environment and use uv for quick installation
+    python3 -m venv ~/.python/verl-sglang && source ~/.python/verl-sglang/bin/activate
+    python3 -m pip install --upgrade pip && python3 -m pip install --upgrade uv
+
+    # Install verl-SGLang
+    git clone https://github.com/volcengine/verl verl-sglang && cd verl-sglang
+    python3 -m uv pip install .
+    
+    # Install the latest stable version of sglang with verl support. Currently, the latest version is 0.4.3.post3.
+    # For SGLang installation, you can also refer to https://docs.sglang.ai/start/install.html
+    python3 -m uv pip install "sglang[all]==0.4.3.post3" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
 
 Install from docker image
 -------------------------
@@ -73,6 +92,7 @@ Image and tag: ``whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-
         git clone -b core_v0.4.0_verl https://github.com/eric-haibin-lin/Megatron-LM
         export PYTHONPATH=$PYTHONPATH:$(pwd)/Megatron-LM
 
+
 Install from custom environment
 ---------------------------------
 
 
@@ -57,6 +57,7 @@ test = [
 ]
 prime = ["pyext"]
 gpu = ["liger-kernel", "flash-attn"]
+sglang = ["sglang[all]==0.4.3.post3"]
 
 # URLs
 [project.urls]
 
@@ -17,5 +17,5 @@ ray[default]
 tensordict<0.6
 torchdata
 transformers
-vllm<=0.6.3
+# vllm==0.6.3.post1
 wandb
@@ -1,5 +1,5 @@
 set -x
-
+ENGINE=${1:-vllm}
 export VLLM_ATTENTION_BACKEND=XFORMERS
 
 python3 -m verl.trainer.main_ppo \
@@ -17,7 +17,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
-    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.name=$ENGINE \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
@@ -36,5 +36,5 @@ python3 -m verl.trainer.main_ppo \
     trainer.n_gpus_per_node=8 \
     trainer.nnodes=1 \
     trainer.save_freq=1 \
-    trainer.default_local_dir=$HOME/ckpt/ \
-    trainer.total_training_steps=1 $@
+    trainer.default_local_dir=$HOME/$ENGINE/ckpt/ \
+    trainer.total_training_steps=1 "${@:2}"
@@ -0,0 +1,210 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import torch
+from torch.distributed.device_mesh import init_device_mesh
+
+from sglang.srt.entrypoints.verl_engine import VerlEngine
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import GenerationConfig
+
+from verl.utils.torch_functional import pad_sequence_to_length
+
+
+def levenshtein(s1, s2):
+    m, n = len(s1), len(s2)
+    # Initialize matrix of zeros
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+    # Initialize first column and first row of the matrix
+    for i in range(m + 1):
+        dp[i][0] = i  # Deletion from s1 to empty string
+    for j in range(n + 1):
+        dp[0][j] = j  # Insertion to s1 from empty string
+    # Compute the Levenshtein distance matrix
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            cost = 0 if s1[i - 1] == s2[j - 1] else 1  # No cost if characters match
+            dp[i][j] = min(
+                dp[i - 1][j] + 1,  # Deletion
+                dp[i][j - 1] + 1,  # Insertion
+                dp[i - 1][j - 1] + cost  # Substitution
+            )
+    return dp[m][n]
+
+
+def are_lists_similar(a, b):
+    if len(a) != len(b):
+        print("The lists are of different lengths.")
+        return False
+
+    total_length = 0
+    total_diff = 0
+
+    for s1, s2 in zip(a, b):
+        max_len = max(len(s1), len(s2))
+        total_length += max_len
+        diff = levenshtein(s1, s2)
+        total_diff += diff
+        print(f"Comparing strings:\n{s1}\n{s2}\nDifference: {diff} characters\n")
+
+    percentage_difference = (total_diff / total_length) * 100
+    print(f"Total difference: {percentage_difference:.2f}%")
+
+    return percentage_difference <= 10
+
+
+def initialize_global_process_group(timeout_second=36000):
+    from datetime import timedelta
+
+    import torch.distributed
+
+    # NOTE MODIFIED should provide backend=None to have nccl+gloo
+    # torch.distributed.init_process_group('nccl', timeout=timedelta(seconds=timeout_second))
+    torch.distributed.init_process_group(timeout=timedelta(seconds=timeout_second))
+
+    local_rank = int(os.environ["LOCAL_RANK"])
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+
+    if torch.distributed.is_initialized():
+        torch.cuda.set_device(local_rank)
+    return local_rank, rank, world_size
+
+
+def test_sglang_spmd():
+    assert torch.cuda.device_count() >= 2, 'At least 2 GPUs is required to run tp+dp tests.'
+    initialize_global_process_group()
+    # fill rollout config
+    max_prompt_length = 16
+    max_response_length = 16
+
+    # Initialize model and token
+    local_cache_path = '~/.cache/verl/rlhf'
+    local_cache_path = os.path.expanduser(local_cache_path)
+    hdfs_path = 'Qwen/Qwen2-7B-Instruct'
+    from verl.utils.fs import copy_to_local
+    local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
+    tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side='left')
+
+    preencode_prompts = [
+        "Who won the Champions League in 2019?",
+        "The founder of Apple is",
+        "What's your name",
+    ]
+    tokenizer.pad_token = tokenizer.eos_token
+    prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
+    input_ids = prompts['input_ids']
+    attention_mask = prompts['attention_mask']
+
+    input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
+    attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)
+
+    actor_model = AutoModelForCausalLM.from_pretrained(local_model_path)
+    actor_model.to(torch.bfloat16)
+
+    sampling_params = dict(n=1,
+                           temperature=0,
+                           top_p=1,
+                           top_k=-1,
+                           max_new_tokens=max_response_length,
+                           presence_penalty=0.0,
+                           frequency_penalty=0.0,
+                           repetition_penalty=1.0,
+                           skip_special_tokens=True,
+                           spaces_between_special_tokens=True,
+                           ignore_eos=False)
+
+    tensor_parallel_size = 4
+    device_mesh_kwargs = dict(mesh_shape=(1, tensor_parallel_size, 1), mesh_dim_names=["dp", "tp", "pp"])
+    inference_device_mesh_cpu = init_device_mesh("cpu", **device_mesh_kwargs)
+
+    for k in ["TORCHELASTIC_USE_AGENT_STORE"]:
+        if k in os.environ:
+            del os.environ[k]
+    print('building sglang rollout engine')
+    llm = VerlEngine(model_path=local_model_path,
+                     dtype="bfloat16",
+                     mem_fraction_static=0.5,
+                     device_mesh_cpu=inference_device_mesh_cpu["tp"],
+                     base_gpu_id=0,
+                     gpu_id_step=1)
+
+    llm.release_memory_occupation()
+    print("start generation")
+    input_ids = input_ids.cuda()
+    attention_mask = attention_mask.cuda()
+    batch_size = input_ids.size(0)
+
+    generation_config = GenerationConfig(do_sample=False)
+    actor_model.cuda()
+    output = actor_model.generate(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=max_response_length,
+        # max_length=max_length,
+        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.pad_token_id,
+        generation_config=generation_config,
+        # renormalize_logits=True,
+        output_scores=False,  # this is potentially very large
+        return_dict_in_generate=True,
+        use_cache=False)  # may OOM when use_cache = True
+    seq = output.sequences
+    response = seq[:, max_prompt_length:]
+
+    hf_response_tokens = tokenizer.batch_decode(response)
+    print(f"hf response: {hf_response_tokens}")
+    print(f"{sampling_params=}")
+    idx_list = []
+    batch_size = input_ids.shape[0]
+
+    pad_token_id = (tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id)
+    for i in range(batch_size):
+        idx_list.append(_pre_process_inputs(pad_token_id, input_ids[i]))
+
+    outputs = llm.generate(input_ids=idx_list, sampling_params=sampling_params)
+    sglang_response_tokens = []
+
+    for output in outputs:
+        print(f"{output=}")
+        generated_text = output["text"]
+        sglang_response_tokens.append(generated_text)
+
+    print(f"sglang response: {sglang_response_tokens}")
+    assert are_lists_similar(hf_response_tokens, sglang_response_tokens), \
+        f"Strings differ more than 10%:\n"
+    print("Check Pass")
+
+
+def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor):
+    # remove the left padding in the prompt token_id
+    # pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
+    non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
+    token_ids = prompt_token_ids[non_pad_index:].tolist()
+    return token_ids
@@ -465,6 +465,7 @@ def __init__(self):
             for key, user_defined_cls in cls_dict.items():
                 user_defined_cls = _unwrap_ray_remote(user_defined_cls)
                 # directly instantiate the class without remote
+                # in worker class, e.g. <verl.single_controller.base.worker.Worker> when DISABLE_WORKER_INIT == 1 it will return immediately
                 with patch.dict(os.environ, {'DISABLE_WORKER_INIT': '1'}):
                     self.worker_dict[key] = user_defined_cls(*init_args_dict[key].get('args', ()),
                                                              **init_args_dict[key].get('kwargs', {}))
Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,7 @@ test = [`
`57`	`57`	`]`
`58`	`58`	`prime = ["pyext"]`
`59`	`59`	`gpu = ["liger-kernel", "flash-attn"]`
	`60`	`+sglang = ["sglang[all]==0.4.3.post3"]`
`60`	`61`
`61`	`62`	`# URLs`
`62`	`63`	`[project.urls]`