Skip to content

Commit d17af14

Browse files
Feat/add tp pp fc (#2768)
1 parent 688a508 commit d17af14

File tree

4 files changed

+567
-4
lines changed

4 files changed

+567
-4
lines changed

scripts/regression/test_dpo.py

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def test_dpo_full(self):
129129
self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)
130130

131131
# test training loss
132-
EXCEPTED_LOSS = 0.474259
132+
EXCEPTED_LOSS = 0.474242
133133
self.dpotrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)
134134

135135
# test model generate
@@ -163,7 +163,7 @@ def test_dpo_lora(self):
163163
self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)
164164

165165
# test training loss
166-
EXCEPTED_LOSS = 0.474163
166+
EXCEPTED_LOSS = 0.474235
167167
self.dpotrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)
168168

169169
# test lora merge
@@ -189,3 +189,87 @@ def test_dpo_lora(self):
189189
[[22407, 120525, 77505, 113631, 47887, 134141, 122487, 61092, 40897, 40601]]
190190
)
191191
self.dpotrain_tester.create_and_check_model_generate(lora_merge_output_dir, EXPECTED_RESULT)
192+
193+
def test_dpo_full_tp_pp(self):
    """Full-parameter DPO training under tensor/pipeline parallelism.

    Launches the DPO trainer on 4 devices via ``paddle.distributed.launch``,
    then checks the exit status, the final training loss against a recorded
    baseline, and the trained model's greedy generation tokens.
    """
    output_dir = os.path.join(OUTPUT_DIR, "dpo_full_tp_pp")
    updated_config_path = self.dpotrain_tester.update_training_args(
        os.path.join(CONFIG_PATH, "full_tp_pp.yaml"),
        output_dir,
        {
            "model_name_or_path": MODEL_NAME_OR_PATH,
            "train_dataset_path": "./tests/fixtures/dummy/ernie/dpo-train.jsonl",
            "eval_dataset_path": "./tests/fixtures/dummy/ernie/dpo-train.jsonl",
            "output_dir": output_dir,
        },
    )
    train_path = os.path.join(TRAIN_PATH, "alignment/dpo/run_dpo.py")

    # 4 devices so the TP/PP degrees requested in full_tp_pp.yaml can be realized.
    launch_cmd = [
        "python", "-u", "-m", "paddle.distributed.launch",
        "--devices", "0,1,2,3",
        train_path,
        updated_config_path,
    ]
    training_p = subprocess.run(launch_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    # Training must exit cleanly and report success in its log.
    self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)

    # Loss regression check against the recorded baseline.
    EXCEPTED_LOSS = 0.495105
    self.dpotrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)

    # The trained checkpoint must reproduce the baseline generation tokens.
    EXPECTED_RESULT = paddle.to_tensor([[22407, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612]])
    self.dpotrain_tester.create_and_check_model_generate(output_dir, EXPECTED_RESULT)
226+
227+
def test_dpo_lora_tp_pp(self):
    """LoRA DPO training under tensor/pipeline parallelism, then adapter merge.

    Checks training exit status and loss, merges the LoRA adapter with
    ``tools/mergekit.py``, and verifies the merged model's generation tokens.

    Fix: training is started through ``paddle.distributed.launch`` on 4
    devices, matching ``test_dpo_full_tp_pp`` and the SFT tp_pp tests — a
    single plain-python process cannot realize the TP/PP degrees requested
    by lora_tp_pp.yaml.
    """
    output_dir = os.path.join(OUTPUT_DIR, "dpo_lora_tp_pp")
    update_args = {
        "model_name_or_path": MODEL_NAME_OR_PATH,
        "train_dataset_path": "./tests/fixtures/dummy/ernie/dpo-train.jsonl",
        "eval_dataset_path": "./tests/fixtures/dummy/ernie/dpo-train.jsonl",
        "output_dir": output_dir,
    }
    config_path = os.path.join(CONFIG_PATH, "lora_tp_pp.yaml")
    updated_config_path = self.dpotrain_tester.update_training_args(config_path, output_dir, update_args)
    train_path = os.path.join(TRAIN_PATH, "alignment/dpo/run_dpo.py")
    cmd = [
        "python",
        "-u",
        "-m",
        "paddle.distributed.launch",
        "--devices",
        "0,1,2,3",
        train_path,
        updated_config_path,
    ]
    training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    # test training result
    self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)

    # test training loss
    EXCEPTED_LOSS = 0.495105
    self.dpotrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)

    # test lora merge
    # NOTE(review): the SFT tp_pp counterpart runs mergekit through
    # paddle.distributed.launch; kept single-process here as originally
    # written — confirm a tp_pp-trained LoRA checkpoint merges in one process.
    lora_merge_output_dir = os.path.join(output_dir, "lora_tp_pp_merge")
    lora_merge_path = os.path.join(TRAIN_PATH, "tools/mergekit.py")

    lora_merge_cmd = [
        "python",
        "-u",
        lora_merge_path,
        "--lora_model_path",
        output_dir,
        "--model_name_or_path",
        MODEL_NAME_OR_PATH,
        "--output_path",
        lora_merge_output_dir,
    ]
    lora_merge_p = subprocess.run(lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    self.dpotrain_tester.assert_result(lora_merge_p.returncode, lora_merge_p.stdout)

    # test lora_merge_model generate
    EXPECTED_RESULT = paddle.to_tensor(
        [[22407, 120525, 77505, 113631, 47887, 134141, 122487, 61092, 40897, 40601]]
    )
    self.dpotrain_tester.create_and_check_model_generate(lora_merge_output_dir, EXPECTED_RESULT)

scripts/regression/test_sft.py

Lines changed: 144 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ def assert_loss(self, output, base_loss):
6060
avg_loss = round(sum_loss, 6)
6161
else:
6262
avg_loss = 0
63+
print(f"Current loss : {avg_loss}")
64+
print(f"Base loss : {base_loss}")
6365
self.assertTrue(abs(avg_loss - base_loss) <= 0.0001, f"loss: {avg_loss}, base_loss: {base_loss}, exist diff!")
6466

6567
def assert_result(self, ret_code, log_output):
@@ -130,7 +132,7 @@ def test_sft_full(self):
130132
self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)
131133

132134
# test training loss
133-
EXCEPTED_LOSS = 11.945683
135+
EXCEPTED_LOSS = 11.945673
134136
self.sfttrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)
135137

136138
# test model resume
@@ -179,7 +181,7 @@ def test_sft_lora(self):
179181
self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)
180182

181183
# test training loss
182-
EXCEPTED_LOSS = 11.956834
184+
EXCEPTED_LOSS = 11.956829
183185
self.sfttrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)
184186

185187
# test lora merge
@@ -209,3 +211,143 @@ def test_sft_lora(self):
209211
[[22407, 120525, 77505, 113631, 47887, 134141, 122487, 61092, 40897, 40601]]
210212
)
211213
self.sfttrain_tester.create_and_check_model_generate(lora_merge_output_dir, EXPECTED_RESULT)
214+
215+
def test_sft_full_tp_pp(self):
    """Full-parameter SFT training under tensor/pipeline parallelism.

    Launches ``run_finetune.py`` on 4 devices via ``paddle.distributed.launch``
    with the full_tp_pp.yaml config, then checks exit status, final training
    loss against the recorded baseline, and the trained model's generation.

    Cleanup: removed the leftover debug ``print(f"cmd {cmd}")`` and the
    commented-out device-count / resume-loss dead code.
    """
    output_dir = os.path.join(OUTPUT_DIR, "sft_full_tp_pp")
    update_args = {
        "model_name_or_path": MODEL_NAME_OR_PATH,
        "train_dataset_path": "./tests/fixtures/dummy/ernie/sft-train.jsonl",
        "eval_dataset_path": "./tests/fixtures/dummy/ernie/sft-train.jsonl",
        "output_dir": output_dir,
    }
    config_path = os.path.join(CONFIG_PATH, "full_tp_pp.yaml")
    updated_config_path = self.sfttrain_tester.update_training_args(config_path, output_dir, update_args)
    train_path = os.path.join(TRAIN_PATH, "run_finetune.py")
    # 4 fixed devices so the TP/PP degrees in full_tp_pp.yaml can be realized.
    cmd = [
        "python",
        "-u",
        "-m",
        "paddle.distributed.launch",
        "--devices",
        "0,1,2,3",
        train_path,
        updated_config_path,
    ]
    training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    # test training result
    self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)

    # test training loss
    EXCEPTED_LOSS = 11.956903
    self.sfttrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)

    # test model generate
    EXPECTED_RESULT = paddle.to_tensor([[22407, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612]])
    self.sfttrain_tester.create_and_check_model_generate(output_dir, EXPECTED_RESULT)
254+
255+
def test_sft_lora_tp_pp(self):
    """LoRA SFT training under tensor/pipeline parallelism, then adapter merge.

    Runs the finetune trainer and the mergekit merge step, each on 4 devices
    through ``paddle.distributed.launch``, checking exit status, the recorded
    baseline loss, and the merged model's generation tokens.
    """
    output_dir = os.path.join(OUTPUT_DIR, "sft_lora_tp_pp")
    updated_config_path = self.sfttrain_tester.update_training_args(
        os.path.join(CONFIG_PATH, "lora_tp_pp.yaml"),
        output_dir,
        {
            "model_name_or_path": MODEL_NAME_OR_PATH,
            "train_dataset_path": "./tests/fixtures/dummy/ernie/sft-train.jsonl",
            "eval_dataset_path": "./tests/fixtures/dummy/ernie/sft-train.jsonl",
            "output_dir": output_dir,
        },
    )
    train_path = os.path.join(TRAIN_PATH, "run_finetune.py")

    # Both training and merge run on 4 devices so the TP/PP sharding applies.
    launcher = ["python", "-u", "-m", "paddle.distributed.launch", "--devices", "0,1,2,3"]

    training_p = subprocess.run(
        launcher + [train_path, updated_config_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    # Training must exit cleanly and report success in its log.
    self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)

    # Loss regression check against the recorded baseline.
    EXCEPTED_LOSS = 11.956903
    self.sfttrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)

    # Merge the trained LoRA adapter back into the base model.
    lora_merge_output_dir = os.path.join(output_dir, "lora_tp_pp_merge")
    lora_merge_path = os.path.join(TRAIN_PATH, "tools/mergekit.py")
    lora_merge_p = subprocess.run(
        launcher
        + [
            lora_merge_path,
            "--lora_model_path",
            output_dir,
            "--model_name_or_path",
            MODEL_NAME_OR_PATH,
            "--output_path",
            lora_merge_output_dir,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    self.sfttrain_tester.assert_result(lora_merge_p.returncode, lora_merge_p.stdout)

    # The merged model must reproduce the baseline generation tokens.
    EXPECTED_RESULT = paddle.to_tensor(
        [[22407, 120525, 77505, 113631, 47887, 134141, 122487, 61092, 40897, 40601]]
    )
    self.sfttrain_tester.create_and_check_model_generate(lora_merge_output_dir, EXPECTED_RESULT)
312+
313+
def test_sft_full_function_call(self):
    """Full-parameter SFT training on the function-call fixture dataset.

    Launches ``run_finetune.py`` on 4 devices via ``paddle.distributed.launch``
    with the full_function_call.yaml config, then checks exit status, final
    training loss against the recorded baseline, and the model's generation.

    Cleanup: removed the leftover debug ``print(f"cmd {cmd}")`` and the
    commented-out resume-training dead code.
    """
    output_dir = os.path.join(OUTPUT_DIR, "sft_full_function_call")
    update_args = {
        "model_name_or_path": MODEL_NAME_OR_PATH,
        "train_dataset_path": "./tests/fixtures/dummy/function-call/function-call-train.jsonl",
        "eval_dataset_path": "./tests/fixtures/dummy/function-call/function-call-eval.jsonl",
        "output_dir": output_dir,
    }
    config_path = os.path.join(CONFIG_PATH, "full_function_call.yaml")
    updated_config_path = self.sfttrain_tester.update_training_args(config_path, output_dir, update_args)
    train_path = os.path.join(TRAIN_PATH, "run_finetune.py")
    cmd = [
        "python",
        "-u",
        "-m",
        "paddle.distributed.launch",
        "--devices",
        "0,1,2,3",
        train_path,
        updated_config_path,
    ]
    training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    # test training result
    self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)

    # test training loss
    EXCEPTED_LOSS = 11.769741
    self.sfttrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)

    # test model generate
    EXPECTED_RESULT = paddle.to_tensor([[22407, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612]])
    self.sfttrain_tester.create_and_check_model_generate(output_dir, EXPECTED_RESULT)

0 commit comments

Comments
 (0)