diff --git a/recipe/gspo/test_gspo_3b_math.sh b/examples/gspo_trainer/test_gspo_3b_math.sh similarity index 100% rename from recipe/gspo/test_gspo_3b_math.sh rename to examples/gspo_trainer/test_gspo_3b_math.sh diff --git a/recipe/gspo/test_gspo_3b_math_slurm.sh b/examples/gspo_trainer/test_gspo_3b_math_slurm.sh similarity index 100% rename from recipe/gspo/test_gspo_3b_math_slurm.sh rename to examples/gspo_trainer/test_gspo_3b_math_slurm.sh diff --git a/recipe/gspo/test_gspo_qwen30b_a3b_ep.sh b/examples/gspo_trainer/test_gspo_qwen30b_a3b_ep.sh similarity index 100% rename from recipe/gspo/test_gspo_qwen30b_a3b_ep.sh rename to examples/gspo_trainer/test_gspo_qwen30b_a3b_ep.sh diff --git a/recipe/char_count/README.md b/recipe/char_count/README.md deleted file mode 100644 index 18f902d15eb..00000000000 --- a/recipe/char_count/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# Char Count -## Introduction -Char count is a simple NLP task. We create it for beginners to grasp the idea of RLVR. The task can be trained using a tiny model (e.g., https://huggingface.co/HuggingFaceTB/SmolLM2-135M) on a consumer GPU with only 8GB. - -## Problem formulation -The prompt is: "How many {char} are there in {word}?". In order for LLM to better answer this question, we create SFT dataset with intermediate steps. For example, - -```text -Question: How many n are there in n-i-n-e? -Answer: -n = n -i != n -n = n -e != n -\boxed{2} -``` - -Note that -- We add a dash between each individual char to make the task easier because each individual char will be tokenized to the same token by most tokenizer. -- In the SFT dataset, we create a CoT by listing all the individual chars and whether it equals to the target. In the end, it outputs the final answer inside the box. -- The task can be verified. -- The word is not always meaningful. Each char is sampled uniformly from a to z. We make the total length and the answer uniformly distributed within a range. - -## Scripts -To create the dataset, run -```bash -python3 create_dataset.py -``` -We create a train set and a val set. Both of them are used of SFT and RL. You can specify the total number of data, min/max length and data path. - -To run the SFT -```bash -bash train_sft.sh -``` -We train SFT for 3 epochs. After 3 epochs, the validation score is around 0.12. - -To run GRPO -```bash -bash train_grpo.sh -``` -We train GRPO for 2 epochs. After 2 epochs, the validation score is around 0.36. diff --git a/recipe/char_count/create_dataset.py b/recipe/char_count/create_dataset.py deleted file mode 100644 index 985b1f03b90..00000000000 --- a/recipe/char_count/create_dataset.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Task description: -Given a random word and a random char, count the number of occurrence of char in the word. - -Create CoT dataset that split the word into separate char. Then list the char and count the occurrence. - -The word set comes from shakespeare -""" - -import os.path -import random - -prompt_template = "How many {} are there in word {}?" - - -def generate_random_char(): - return chr(97 + random.randint(0, 25)) - - -def create_prompt_response(min_length=3, max_length=5): - # randomly generate a length - word_length = random.randint(min_length, max_length) - # randomly generate a target count number. This makes the target number - target_count_number = random.randint(1, word_length) - - char_lst = [] - # generate the word - # step 1: generate the target word - target_char = generate_random_char() - - for _ in range(target_count_number): - char_lst.append(target_char) - - # step 2: generate other words - for _ in range(word_length - target_count_number): - while True: - char = generate_random_char() - if char != target_char: - char_lst.append(char) - break - - # step 3: random permute char_lst - random.shuffle(char_lst) - - word = "-".join(char_lst) - - prompt = prompt_template.format(target_char, word) - final_answer = [] - - # cot - number = 0 - for i, char in enumerate(char_lst): - cot = f"{char}" - if char != target_char: - cot += " != " - else: - cot += " = " - number += 1 - cot += f"{target_char}." - - final_answer.append(cot) - - conclusion = f"\\boxed{{{number}}} {target_char} in {word}." - - final_answer.append(conclusion) - - final_answer = "\n".join(final_answer) - - return prompt, final_answer - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("--total_number", type=int, default=10000) - parser.add_argument("--min_length", type=int, default=5) - parser.add_argument("--max_length", type=int, default=20) - parser.add_argument("--data_path", type=str, default="~/data/char_count") - - args = vars(parser.parse_args()) - - total_number = args["total_number"] - min_length = args["min_length"] - max_length = args["max_length"] - data_path = args["data_path"] - data_path = os.path.expanduser(data_path) - - full_output = [] - for _ in range(total_number): - output = create_prompt_response(min_length=min_length, max_length=max_length) - full_output.append(output) - - # random reorder - random.shuffle(full_output) - - # split for train and test - train_split_len = int(0.9 * len(full_output)) - train_outputs = full_output[:train_split_len] - test_output = full_output[train_split_len:] - - sft_train_dataset = {"prompt": [], "response": []} - - for o in train_outputs: - sft_train_dataset["prompt"].append(o[0]) - sft_train_dataset["response"].append(o[1]) - - sft_test_dataset = {"prompt": [], "response": []} - - for o in test_output: - sft_test_dataset["prompt"].append(o[0]) - sft_test_dataset["response"].append(o[1]) - - import pandas as pd - - sft_train_dataset = pd.DataFrame(data=sft_train_dataset) - sft_test_dataset = pd.DataFrame(data=sft_test_dataset) - - folder = os.path.join(data_path, "sft") - - os.makedirs(folder, exist_ok=True) - - sft_train_dataset.to_parquet(os.path.join(folder, "train.parquet")) - sft_test_dataset.to_parquet(os.path.join(folder, "test.parquet")) - - # build RL dataset - rl_train_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []} - - rl_test_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []} - - from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed - - for o in train_outputs: - prompt = o[0] - response = o[1] - prompt_with_template = [ - { - "role": "user", - "content": prompt, - } - ] - - rl_train_dataset["prompt"].append(prompt_with_template) - rl_train_dataset["data_source"].append("char_count") - rl_train_dataset["ability"].append("other") - rl_train_dataset["reward_model"].append( - {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))} - ) - rl_train_dataset["extra_info"].append({"response": response}) - - for o in test_output: - prompt = o[0] - response = o[1] - prompt_with_template = [ - { - "role": "user", - "content": prompt, - } - ] - - rl_test_dataset["prompt"].append(prompt_with_template) - rl_test_dataset["data_source"].append("char_count") - rl_test_dataset["ability"].append("other") - rl_test_dataset["reward_model"].append( - {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))} - ) - rl_test_dataset["extra_info"].append({"response": response}) - - rl_train_dataset = pd.DataFrame(data=rl_train_dataset) - rl_test_dataset = pd.DataFrame(data=rl_test_dataset) - - folder = os.path.join(data_path, "rl") - - os.makedirs(folder, exist_ok=True) - - rl_train_dataset.to_parquet(os.path.join(folder, "train.parquet")) - rl_test_dataset.to_parquet(os.path.join(folder, "test.parquet")) diff --git a/recipe/char_count/reward_function.py b/recipe/char_count/reward_function.py deleted file mode 100644 index 7c87ea49a1b..00000000000 --- a/recipe/char_count/reward_function.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Reward function -""" - -from verl.utils.reward_score import math_reward - - -def char_count_reward_function(data_source, solution_str, ground_truth, extra_info=None): - try: - last_boxed_string = math_reward.last_boxed_only_string(solution_str) - if last_boxed_string is None: - return 0 - solution = math_reward.remove_boxed(last_boxed_string) - if solution == ground_truth: - return 1 - else: - return 0 - except Exception: - print(ground_truth, solution_str) - return 0 diff --git a/recipe/char_count/train_grpo.sh b/recipe/char_count/train_grpo.sh deleted file mode 100644 index 5de85422fc4..00000000000 --- a/recipe/char_count/train_grpo.sh +++ /dev/null @@ -1,43 +0,0 @@ -set -x - - -python3 -m verl.trainer.main_ppo \ - algorithm.adv_estimator=grpo \ - data.train_files=$HOME/data/char_count/rl/train.parquet \ - data.val_files=$HOME/data/char_count/rl/test.parquet \ - data.train_batch_size=128 \ - data.max_prompt_length=128 \ - data.max_response_length=128 \ - data.filter_overlong_prompts=False \ - data.truncation='error' \ - actor_rollout_ref.model.path=./models/sft/global_step_105 \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.ppo_mini_batch_size=16 \ - actor_rollout_ref.actor.use_dynamic_bsz=True \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5000 \ - actor_rollout_ref.actor.use_kl_loss=False \ - actor_rollout_ref.actor.kl_loss_coef=0.0 \ - actor_rollout_ref.actor.kl_loss_type=low_var_kl \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.fsdp_config.param_offload=True \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ - actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ - actor_rollout_ref.rollout.n=8 \ - actor_rollout_ref.ref.fsdp_config.param_offload=True \ - algorithm.use_kl_in_reward=False \ - trainer.critic_warmup=0 \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name='verl_example' \ - trainer.experiment_name='smol135m_grpo' \ - trainer.val_before_train=True \ - trainer.n_gpus_per_node=1 \ - trainer.nnodes=1 \ - trainer.save_freq=-1 \ - trainer.test_freq=5 \ - trainer.total_epochs=2 \ - custom_reward_function.path=recipe/char_count/reward_function.py \ - custom_reward_function.name=char_count_reward_function diff --git a/recipe/char_count/train_sft.sh b/recipe/char_count/train_sft.sh deleted file mode 100644 index 56f5cec5316..00000000000 --- a/recipe/char_count/train_sft.sh +++ /dev/null @@ -1,21 +0,0 @@ -set -x - -nproc_per_node=1 -save_path=./models/sft - -torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ - -m verl.trainer.fsdp_sft_trainer \ - data.train_files=$HOME/data/char_count/sft/train.parquet \ - data.val_files=$HOME/data/char_count/sft/test.parquet \ - data.prompt_key=prompt \ - data.response_key=response \ - data.micro_batch_size_per_gpu=8 \ - data.max_length=256 \ - data.train_batch_size=256 \ - use_remove_padding=True \ - model.partial_pretrain=HuggingFaceTB/SmolLM2-135M-Instruct \ - trainer.default_local_dir=$save_path \ - trainer.project_name=char_count-sft \ - trainer.experiment_name=char_count-sft-SmolLM2-135M-Instruct \ - trainer.total_epochs=3 \ - trainer.logger=console \ No newline at end of file