Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ jobs:
if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
with:
RUNNER: self-hosted-azure
TIMEOUT: 15
TIMEOUT: 20
UNIT_TEST_SCRIPT: |
cd /opt/reinforcer
uv run --no-sync bash -x ./tests/run_unit.sh
Expand Down
2 changes: 2 additions & 0 deletions examples/configs/eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ generation:
top_k: -1 # disable
num_prompts_per_step: -1 # -1 means pass all prompts at once
model_name: "Qwen/Qwen2.5-Math-1.5B-Instruct"
stop_token_ids: null
stop_strings: null
vllm_cfg:
tensor_parallel_size: 1
gpu_memory_utilization: 0.9
Expand Down
4 changes: 3 additions & 1 deletion examples/configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ policy:
temperature: 1.0
top_p: 1.0
top_k: null
stop_token_ids: null
stop_strings: null
vllm_cfg:
tensor_parallel_size: 1
gpu_memory_utilization: 0.6
Expand All @@ -69,7 +71,7 @@ data:
prompt_file: "examples/prompts/cot.txt"
system_prompt_file: null
dataset_name: "OpenMathInstruct-2"

env:
math:
num_workers: 8
Expand Down
4 changes: 3 additions & 1 deletion examples/configs/grpo_math_8B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@ policy:
temperature: 1.0
top_p: 1.0
top_k: null
stop_token_ids: null
stop_strings: null
vllm_cfg:
tensor_parallel_size: 1
gpu_memory_utilization: 0.6
max_model_len: ${policy.max_total_sequence_length}

cluster:
gpus_per_node: 8
num_nodes: 1
11 changes: 6 additions & 5 deletions nemo_reinforcer/models/generation/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,17 +118,18 @@ def configure_generation_config(
"""Apply specific configurations to generation config."""
# tokenizer setting
config["pad_token_id"] = tokenizer.pad_token_id
# When https://github.com/NVIDIA/reinforcer/issues/57 is fixed, we should update stop_token_ids below.
config["stop_token_ids"] = [tokenizer.eos_token_id]
if config["stop_token_ids"] is None:
config["stop_token_ids"] = [tokenizer.eos_token_id]

# vllm setting
if config["backend"] == "vllm":
if is_eval:
# set load_format
config["vllm_cfg"]["load_format"] = "auto" if is_eval else "dummy"
# set skip_tokenizer_init
if is_eval or config["stop_strings"] is not None:
config["vllm_cfg"]["skip_tokenizer_init"] = False
config["vllm_cfg"]["load_format"] = "auto"
else:
config["vllm_cfg"]["skip_tokenizer_init"] = True
config["vllm_cfg"]["load_format"] = "dummy"

return config

Expand Down
12 changes: 7 additions & 5 deletions nemo_reinforcer/models/generation/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,13 @@ def generate(
sampling_params = self.SamplingParams(
temperature=self.cfg["temperature"] if not greedy else 0,
top_p=self.cfg["top_p"],
top_k=top_k
if not greedy
else 1, # we use a default of -1 if unset so that 'null'/None is a common disable value
# we use a default of -1 if unset so that 'null'/None is a common disable value
top_k=top_k if not greedy else 1,
max_tokens=self.cfg["max_new_tokens"],
logprobs=0, # Return logprobs for the generated tokens
stop=None,
stop_token_ids=self.cfg["stop_token_ids"],
stop=self.cfg["stop_strings"],
include_stop_str_in_output=True, # returning stop strings like hf
)

# Generate outputs
Expand Down Expand Up @@ -352,7 +352,9 @@ def generate_text(
top_p=self.cfg["top_p"],
top_k=top_k if not greedy else 1,
max_tokens=self.cfg["max_new_tokens"],
stop=self.cfg.get("stop_sequences", None),
stop_token_ids=self.cfg["stop_token_ids"],
stop=self.cfg["stop_strings"],
include_stop_str_in_output=True, # returning stop strings like hf
)

# Generate outputs
Expand Down
6 changes: 4 additions & 2 deletions nemo_reinforcer/models/policy/hf_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,8 +544,10 @@ def generate(
temperature=gen_cfg["temperature"],
top_p=gen_cfg["top_p"],
top_k=gen_cfg["top_k"],
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=gen_cfg["pad_token_id"],
eos_token_id=gen_cfg["stop_token_ids"],
stop_strings=gen_cfg["stop_strings"],
tokenizer=self.tokenizer, # needs for stop_strings
return_dict_in_generate=True,
output_scores=True,
synced_gpus=True,
Expand Down
127 changes: 93 additions & 34 deletions tests/unit/models/generation/test_vllm_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict
from nemo_reinforcer.models.generation.interfaces import configure_generation_config
from nemo_reinforcer.models.generation.vllm import VllmGeneration, VllmConfig
from nemo_reinforcer.models.policy import PolicyConfig


# Define basic vLLM test config
Expand All @@ -35,13 +36,38 @@
"temperature": 1.0,
"top_p": 1.0,
"top_k": None,
"stop_token_ids": None,
"stop_strings": None,
"vllm_cfg": {
"tensor_parallel_size": 1,
"gpu_memory_utilization": 0.3,
"max_model_len": 1024,
},
}

# Create HF-specific config with required parameters
# Baseline HfPolicy config shared by the tests in this module. Tests that
# need different values (e.g. a larger train_global_batch_size) should copy
# this dict and override fields rather than mutate it in place.
basic_hf_test_config: PolicyConfig = {
    # Reuse the vLLM test model/tokenizer so both policies load identical weights
    "model_name": basic_vllm_test_config["model_name"],
    "tokenizer_name": basic_vllm_test_config["tokenizer_name"],
    # Required training parameters
    "train_global_batch_size": 1,
    "train_micro_batch_size": 1,
    "learning_rate": 5e-6,
    "logprob_batch_size": 1,
    "max_new_tokens": 16,
    "do_sample": False,  # greedy decoding for deterministic test output
    "precision": "float32",
    "optimizer": {
        "name": "torch.optim.AdamW",
        "kwargs": {
            "lr": 5e-6,
            "weight_decay": 0.01,
            "betas": [0.9, 0.999],
            "eps": 1e-8,
        },
    },
}


@pytest.fixture(scope="module")
def cluster():
Expand Down Expand Up @@ -193,28 +219,8 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer):
vllm_config = basic_vllm_test_config.copy()
vllm_config = configure_generation_config(vllm_config, tokenizer)

# Create HF-specific config with required parameters
hf_config = {
"model_name": basic_vllm_test_config["model_name"],
"tokenizer_name": basic_vllm_test_config["tokenizer_name"],
# Required training parameters
"train_global_batch_size": 4,
"train_micro_batch_size": 1,
"learning_rate": 5e-6,
"logprob_batch_size": 1,
"max_new_tokens": 16,
"do_sample": False,
"precision": "float32",
"optimizer": {
"name": "torch.optim.AdamW",
"kwargs": {
"lr": 5e-6,
"weight_decay": 0.01,
"betas": [0.9, 0.999],
"eps": 1e-8,
},
},
}
hf_config = basic_hf_test_config.copy()
hf_config["train_global_batch_size"] = 4

vllm_policy = None
hf_policy = None
Expand Down Expand Up @@ -498,18 +504,7 @@ def test_vllm_weight_update_and_prefix_cache_reset(
if tensor_parallel_size > 1:
vllm_config["vllm_kwargs"] = {"distributed_executor_backend": "ray"}

hf_config = {
"model_name": basic_vllm_test_config["model_name"],
"tokenizer_name": "meta-llama/Llama-3.2-1B",
"train_global_batch_size": 1,
"train_micro_batch_size": 1,
"learning_rate": 1e-6,
"logprob_batch_size": 1,
"max_new_tokens": 16,
"do_sample": False,
"precision": "float32",
"optimizer": {"name": "torch.optim.AdamW", "kwargs": {"lr": 1e-6}},
}
hf_config = basic_hf_test_config.copy()

# Create policies
vllm_policy = None
Expand Down Expand Up @@ -592,3 +587,67 @@ def test_vllm_weight_update_and_prefix_cache_reset(

gc.collect()
torch.cuda.empty_cache()


@pytest.mark.parametrize("is_eval", [True, False])
def test_vllm_generation_with_stop(cluster, test_input_data, tokenizer, is_eval):
    """Test vLLM generation honors ``stop_token_ids`` and ``stop_strings``.

    Parametrized over eval mode (weights loaded directly from the checkpoint)
    and training mode (dummy-initialized weights refit from an HfPolicy via
    IPC handles). In both modes greedy generation must truncate at the
    configured stop conditions and include the stop string in the output.
    """
    import copy

    from nemo_reinforcer.models.policy.hf_policy import HfPolicy

    # Deep-copy the shared module-level config: configure_generation_config
    # mutates the nested "vllm_cfg" dict, which a shallow .copy() would still
    # share with basic_vllm_test_config, leaking state into other tests.
    vllm_config = copy.deepcopy(basic_vllm_test_config)
    vllm_config["stop_token_ids"] = [3363]
    vllm_config["stop_strings"] = ["I am a"]
    vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=is_eval)

    # The expected strings below are model-specific; guard the preconditions.
    assert vllm_config["model_name"] == "meta-llama/Llama-3.2-1B", (
        "Model name should be meta-llama/Llama-3.2-1B to get expected output"
    )
    assert vllm_config["vllm_cfg"]["tensor_parallel_size"] == 1, (
        "Tensor parallel size should be 1 to get expected output"
    )

    # Create policies
    print("Creating vLLM policy...")
    vllm_generation = VllmGeneration(cluster, vllm_config)

    # Get weights from HF policy if not in eval mode
    if not is_eval:
        # set to sleep first if not in eval mode
        vllm_generation.finish_generation()

        print("Creating HF policy...")
        hf_config = copy.deepcopy(basic_hf_test_config)
        hf_policy = HfPolicy(cluster, hf_config)

        # plain string: was an f-string with no placeholders (ruff F541)
        print("refitting vllm policy...")
        ipc_handles = hf_policy.get_weights_ipc_handles()
        vllm_generation.prepare_for_generation()
        vllm_generation.update_weights(ipc_handles)

    # test generate: output must stop at the configured token / stop string
    outputs = vllm_generation.generate(test_input_data, greedy=True)
    output_ids = outputs["output_ids"]
    generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    assert generated_texts == [
        "Hello, my name is Kelsey and I am a",
        "The capital of France is Paris. The city",
    ], "Output should be the same as the expected output"

    # test generate_text: same stop behavior through the text-in/text-out path
    test_prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]
    test_prompts = BatchedDataDict({"prompts": test_prompts})
    output = vllm_generation.generate_text(test_prompts, greedy=True)
    assert output["texts"] == [
        " Kelsey and I am a",
        " Paris. The city",
    ], "Output should be the same as the expected output"

    # Clean up
    vllm_generation.shutdown()
    if not is_eval:
        hf_policy.shutdown()
Loading