
Commit

delete notes and modify argument
greycooker committed Feb 24, 2025
1 parent 0eecab8 commit 8ac7fc1
Showing 9 changed files with 255 additions and 58 deletions.
37 changes: 37 additions & 0 deletions llm/alignment/ppo/client.py
@@ -0,0 +1,37 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json

import requests

CHAT_URL = "http://127.0.0.1:8731"

data = {
"src": [
"Natalia sold clips to 48 of her friends in April, ",
"Weng earns $12 an hour for babysitting. Yesterday",
],
"tgt": [
"Natalia sold 48/2 = 24 clips in May. #### 72",
"She earned 0.2 x 50 = $10. #### 10",
],
"response": [
"Natalia sold 48+24 = 72 clips altogether in April and May. #### 72",
"2",
],
}
res = requests.post(CHAT_URL, json=data)
result = json.loads(res.text)
print("result:", result, result["score"])
5 changes: 0 additions & 5 deletions llm/alignment/ppo/comm_utils.py
@@ -192,10 +192,6 @@ def reload_tensor_to_gpu(tensors):
return

# optimizer
# print("hello1!!!")
# print(type(tensors))
# print(len(tensors))
# print(tensors[0])
if "optimizer" in tensors[1]:
optimizer = tensors[0]
# offload moment1
@@ -617,7 +613,6 @@ def export_evaluate_model(self: Trainer, train_model, eval_model, **kwargs):
ret = distributed_gather(tensor, dst=0, group=tp_group, offload=False)
action = tp_actions.pop(key)
tensor = action(ret) if is_dst else None
# if is_dst: print("="*20, "gather", key, [t.shape for t in ret], tensor.shape)
else:
tensor = tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None

2 changes: 1 addition & 1 deletion llm/alignment/ppo/grpo_config.json
@@ -4,7 +4,7 @@
"ptx_datasets": "alpaca",
"actor_model_name_or_path": "PKU-Alignment/alpaca-7b-reproduced",
"reward_model_name_or_path": "PKU-Alignment/alpaca-7b-reproduced",
"output_dir": "checkpoints/ernie-bot-ppo",
"output_dir": "checkpoints/ppo",
"use_fusemt": 1,
"use_flash_attention": 1,
"max_dec_len": 1024,
14 changes: 2 additions & 12 deletions llm/alignment/ppo/infer_utils.py
@@ -70,17 +70,11 @@ def create_infer_model(model, dtype, set_state=False):
)
config = copy.deepcopy(model.config)
config.tensor_parallel_rank, config.tensor_parallel_degree = init_dist_env()

print("tensor parallel degree in create_infer_model", config.tensor_parallel_degree)
# Why does ppo need this line? Commenting it out raises an error.
# config.quant_type = None
# ################### ppohacking
config.quant_type = []
config.cachekv_int8_type = None
config.append_attn = False
# breakpoint()
# ################### ppohacking
config.single_card_ptq = True

infer_model_cls = getattr(paddlenlp.experimental.transformers, model.__class__.__name__ + "InferenceModel")
# ori_init_weights = infer_model_cls.init_weights
# infer_model_cls.init_weights = lambda self: None
@@ -130,7 +124,7 @@ def _create_param(self, *args, **kwargs):
"repetition_penalty": trainer.args.repetition_penalty,
}
)[0]
print("trainer amp_dtype", trainer.amp_dtype)

policy_predictor = Predictor(predictor_args, model=infer_model, tokenizer=trainer.tokenizer)
return policy_predictor

@@ -174,8 +168,6 @@ def disable(self, model, onload_model=True):
self.is_available = False

def enable(self, model, offload_model=True):
print("tensor parallel degree in enable")
print(model.config.tensor_parallel_degree)
if self.is_available:
return
# set params
@@ -185,8 +177,6 @@ def enable(self, model, offload_model=True):
@paddle.no_grad()
def set_state_dict(self, model, offload_model=True):
key = list(model.state_dict().keys())[3]
print("model state dict dtype", model.state_dict()[key].dtype)
print(paddle.get_default_dtype())
self.model.set_state_dict(model.state_dict())
if offload_model:
offload_place = paddle.CUDAPinnedPlace()
12 changes: 1 addition & 11 deletions llm/alignment/ppo/ppo_trainer.py
@@ -1273,7 +1273,6 @@ def prediction_step(
src = self.tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=True)
tgt = self.tokenizer.batch_decode(inputs["label_ids"], skip_special_tokens=True)
response = self.tokenizer.batch_decode(generated_seq[:, prompt_len:], skip_special_tokens=True)
# pdb
reward_score = self.request_reward_server(src, tgt, response)

reward_score = reward_score.squeeze(axis=-1).cast(paddle.float32)
@@ -2611,7 +2610,6 @@ def rollout_reward_value(
reward_score = self.request_reward_server(src, tgt, response)

reward_score = reward_score.squeeze(axis=-1)
# the shape of reward_score is [1]; the shape of reward_score is []

if self.args.rl_algorithm == "grpo":
return {"rewards": reward_score}
@@ -2660,11 +2658,8 @@ def post():
reward_score = paddle.empty(shape=[len(response)], dtype=self._model_config.dtype)
paddle.distributed.barrier(tp_group)
paddle.distributed.broadcast(reward_score, src=tp_group.ranks[0], group=tp_group)
# reward_score =
# Tensor(shape=[1], dtype=bfloat16, place=Place(gpu:7), stop_gradient=True,
# [0.00000000])

return reward_score
return reward_score.unsqueeze(-1)

@paddle.no_grad()
def normalize_batch_data(
@@ -2741,10 +2736,6 @@ def normalize_batch_data(
sequence_mask = attention_mask[:, 1:].clone() # length: src + tgt -1
sequence_mask[:, :start] = False
if use_tgt_len_value:
# pdb
print("print log_probs!!!")
print(ref_log_probs.shape)
print(start)
ref_log_probs = ref_log_probs[:, start:].contiguous()
old_log_probs = old_log_probs[:, start:].contiguous()
if self.args.rl_algorithm == "ppo":
@@ -2858,7 +2849,6 @@ def compute_grpo_advantages(
id2mean = {}
id2std = {}
batch_size = rewards.shape[0]
# if rewards.ndim > 0 else 1

for i in range(batch_size):
id2score[index[i]].append(rewards[i])
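Two of the retained changes above are worth spelling out. First, request_reward_server now returns reward_score.unsqueeze(-1), i.e. shape [batch, 1] rather than [batch], so the callers' squeeze(axis=-1) in prediction_step and rollout_reward_value keeps yielding a flat per-sample score. Second, compute_grpo_advantages groups rewards by prompt index (the id2score/id2mean/id2std bookkeeping) and normalizes each reward against its own group. The sketch below illustrates that group-relative normalization; it is a reading of the hunk above, not the committed implementation, and the epsilon and absence of clipping are assumptions.

from collections import defaultdict

import paddle


def group_relative_advantages(rewards, index, eps=1e-6):
    # rewards: 1-D paddle.Tensor with one scalar reward per sampled response;
    # index: one prompt id per sample, shared by responses to the same prompt.
    id2score = defaultdict(list)
    for i in range(rewards.shape[0]):
        id2score[index[i]].append(rewards[i].item())

    # Per-prompt mean and (population) std of the sampled rewards.
    id2mean = {k: sum(v) / len(v) for k, v in id2score.items()}
    id2std = {
        k: (sum((x - id2mean[k]) ** 2 for x in v) / len(v)) ** 0.5
        for k, v in id2score.items()
    }

    # Advantage of each sample relative to the other samples of its prompt.
    advantages = [
        (rewards[i].item() - id2mean[index[i]]) / (id2std[index[i]] + eps)
        for i in range(rewards.shape[0])
    ]
    return paddle.to_tensor(advantages, dtype=rewards.dtype)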
1 change: 0 additions & 1 deletion llm/alignment/ppo/run_ppo.py
@@ -164,7 +164,6 @@ def main():
runtime_timer.start("Actor model loading time")

# actor model
# pdb
actor_model_config = AutoConfig.from_pretrained(
model_args.actor_model_name_or_path,
tensor_parallel_output=training_args.tensor_parallel_output,
82 changes: 82 additions & 0 deletions llm/config/llama/grpo_argument.json
@@ -0,0 +1,82 @@
{
"train_datasets": "PKU-SafeRLHF/train",
"eval_datasets": "PKU-SafeRLHF/test",
"ptx_datasets": "alpaca",
"actor_model_name_or_path": "PKU-Alignment/alpaca-7b-reproduced",
"reward_model_name_or_path": "PKU-Alignment/beaver-7b-v1.0-reward",
"output_dir": "checkpoints/llama-grpo",
"logging_dir": "log",
"max_length": 2048,
"use_fusemt": 1,
"use_flash_attention": 1,
"max_dec_len": 1024,
"min_dec_len": 1,
"top_p": 0.8,
"temperature": 1.0,
"num_return_sequences": 1,
"repetition_penalty": 1.0,
"num_train_epochs": 1,
"max_steps": 17,
"update_iters": 1,
"per_device_prompt_batch_size": 2,
"per_device_train_batch_size": 2,
"gradient_accumulation_steps": 1,
"learning_rate": 2e-6,
"min_learning_rate": 2e-7,
"weight_decay": 0.01,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.03,
"recompute": 1,
"recompute_granularity": "full",
"recompute_use_reentrant": 1,
"critic_learning_rate": 2e-6,
"critic_min_learning_rate": 2e-7,
"critic_weight_decay": 0.01,
"critic_lr_scheduler_type": "cosine",
"critic_warmup_ratio": 0.03,
"critic_recompute": 1,
"critic_recompute_granularity": "full",
"normalize_reward": 1,
"normalize_advantage": 1,
"kl_coeff": 0.02,
"clip_range_ratio": 0.2,
"clip_range_score": 10.0,
"clip_range_value": 5.0,
"ptx_coeff": 16.0,
"logging_steps": 1,
"logging_dir": "vdl_log",
"evaluation_strategy": "no",
"per_device_eval_batch_size": 16,
"eval_steps": 10000,
"save_strategy": "steps",
"save_steps": 400,
"save_total_limit": 5,
"bf16": 1,
"fp16": 0,
"fp16_opt_level": "O2",
"do_train": 1,
"do_eval": 0,
"disable_tqdm": 1,
"sharding_parallel_degree": 1,
"sharding": "stage1",
"tensor_parallel_degree": 8,
"tensor_parallel_output": 0,
"pipeline_parallel_degree": 1,
"pipeline_parallel_config": "disable_p2p_cache_shape",
"sequence_parallel": 0,
"max_grad_norm": 1.0,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"dataloader_drop_last": 0,
"eval_mode": "",
"offload_level": "freeze_model optimizer train_model",
"release_grads": 1,
"seed": 23,
"use_fused_head_and_loss_fn": 0,
"fused_linear":1,
"autotuner_benchmark": 0,
"skip_profile_timer": 1,
"use_rm_server": true,
"reward_server": "http://10.174.146.80:8048",
"rl_algorithm": "grpo"
}
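The new GRPO config turns on the remote reward path: use_rm_server is true and reward_server points at a scoring endpoint queried in the same format as llm/alignment/ppo/client.py. Note that the file sets logging_dir twice ("log" and "vdl_log"); json.load keeps the last occurrence, so "vdl_log" is what takes effect. The file is presumably consumed by run_ppo.py as its single JSON argument, following the usual PaddleNLP convention. A small illustrative sanity check (the script itself is not part of the commit; only the path and field names come from it):

import json

# Path added in this commit; adjust if the repo is checked out elsewhere.
with open("llm/config/llama/grpo_argument.json") as f:
    cfg = json.load(f)

# With use_rm_server enabled, rollout rewards are fetched over HTTP,
# so reward_server must point at a reachable scoring service.
if cfg["use_rm_server"]:
    assert cfg.get("reward_server"), "reward_server is required when use_rm_server is true"

print(cfg["rl_algorithm"], cfg["logging_dir"])  # -> grpo vdl_log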
78 changes: 50 additions & 28 deletions llm/config/llama/ppo_argument.json
@@ -4,54 +4,76 @@
"ptx_datasets": "alpaca",
"actor_model_name_or_path": "PKU-Alignment/alpaca-7b-reproduced",
"reward_model_name_or_path": "PKU-Alignment/beaver-7b-v1.0-reward",
"output_dir": "checkpoints/llm_ppo",
"max_length": 512,
"output_dir": "checkpoints/llama-ppo",
"logging_dir": "log",
"max_length": 2048,
"use_fusemt": 1,
"use_flash_attention": 1,
"max_dec_len": 1024,
"min_dec_len": 1,
"top_p": 0.8,
"temperature": 1.0,
"num_return_sequences":1,
"num_return_sequences": 1,
"repetition_penalty": 1.0,
"num_train_epochs": 1,
"max_steps": 17,
"update_iters": 1,
"per_device_prompt_batch_size": 16,
"per_device_train_batch_size": 16,
"per_device_prompt_batch_size": 2,
"per_device_train_batch_size": 2,
"gradient_accumulation_steps": 1,
"learning_rate": 1e-5,
"learning_rate": 2e-6,
"min_learning_rate": 2e-7,
"weight_decay": 0.01,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.03,
"recompute": true,
"critic_learning_rate": 5e-6,
"critic_weight_decay": 0.0,
"critic_lr_scheduler_type": "constant",
"recompute": 1,
"recompute_granularity": "full",
"recompute_use_reentrant": 1,
"critic_learning_rate": 2e-6,
"critic_min_learning_rate": 2e-7,
"critic_weight_decay": 0.01,
"critic_lr_scheduler_type": "cosine",
"critic_warmup_ratio": 0.03,
"critic_recompute": true,
"normalize_reward": false,
"critic_recompute": 1,
"critic_recompute_granularity": "full",
"normalize_reward": 1,
"normalize_advantage": 1,
"kl_coeff": 0.02,
"clip_range_ratio": 0.2,
"clip_range_score": 50.0,
"clip_range_score": 10.0,
"clip_range_value": 5.0,
"ptx_coeff": 16.0,
"per_device_eval_batch_size": 16,
"logging_steps": 1,
"evaluation_strategy": "steps",
"eval_steps": 100,
"save_strategy": "epoch",
"save_steps": 100000,
"bf16": true,
"logging_dir": "vdl_log",
"evaluation_strategy": "no",
"per_device_eval_batch_size": 16,
"eval_steps": 10000,
"save_strategy": "steps",
"save_steps": 400,
"save_total_limit": 5,
"bf16": 1,
"fp16": 0,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"save_total_limit": 1,
"sharding_parallel_degree": 4,
"do_train": 1,
"do_eval": 0,
"disable_tqdm": 1,
"sharding_parallel_degree": 1,
"sharding": "stage1",
"tensor_parallel_degree": 2,
"tensor_parallel_degree": 8,
"tensor_parallel_output": 0,
"pipeline_parallel_degree": 1,
"pipeline_parallel_config": "disable_p2p_cache_shape",
"max_grad_norm": 1.0,
"sequence_parallel": 0,
"max_grad_norm": 1.0,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"dataloader_drop_last": false,
"dataloader_drop_last": 0,
"eval_mode": "",
"offload_level": "freeze_model"
"offload_level": "freeze_model optimizer train_model",
"release_grads": 1,
"seed": 23,
"use_fused_head_and_loss_fn": 0,
"fused_linear":1,
"autotuner_benchmark": 0,
"skip_profile_timer": 1
}

