@@ -60,6 +60,8 @@ def assert_loss(self, output, base_loss):
6060 avg_loss = round (sum_loss , 6 )
6161 else :
6262 avg_loss = 0
63+ print (f"Current loss : { avg_loss } " )
64+ print (f"Base loss : { base_loss } " )
6365 self .assertTrue (abs (avg_loss - base_loss ) <= 0.0001 , f"loss: { avg_loss } , base_loss: { base_loss } , exist diff!" )
6466
6567 def assert_result (self , ret_code , log_output ):
@@ -130,7 +132,7 @@ def test_sft_full(self):
130132 self .sfttrain_tester .assert_result (training_p .returncode , training_p .stdout )
131133
132134 # test training loss
133- EXCEPTED_LOSS = 11.945683
135+ EXCEPTED_LOSS = 11.945673
134136 self .sfttrain_tester .assert_loss (training_p .stdout , EXCEPTED_LOSS )
135137
136138 # test model resume
@@ -179,7 +181,7 @@ def test_sft_lora(self):
179181 self .sfttrain_tester .assert_result (training_p .returncode , training_p .stdout )
180182
181183 # test training loss
182- EXCEPTED_LOSS = 11.956834
184+ EXCEPTED_LOSS = 11.956829
183185 self .sfttrain_tester .assert_loss (training_p .stdout , EXCEPTED_LOSS )
184186
185187 # test lora merge
@@ -209,3 +211,143 @@ def test_sft_lora(self):
209211 [[22407 , 120525 , 77505 , 113631 , 47887 , 134141 , 122487 , 61092 , 40897 , 40601 ]]
210212 )
211213 self .sfttrain_tester .create_and_check_model_generate (lora_merge_output_dir , EXPECTED_RESULT )
214+
def test_sft_full_tp_pp(self):
    """Run full SFT with the ``full_tp_pp.yaml`` config and validate the run.

    The training script is started through ``paddle.distributed.launch`` on
    devices 0-3 with stdout and stderr merged into one captured text stream.
    The captured output is handed to the tester's result and loss checks,
    and the output directory is then passed to the generation check.
    """
    run_dir = os.path.join(OUTPUT_DIR, "sft_full_tp_pp")
    dataset_file = "./tests/fixtures/dummy/ernie/sft-train.jsonl"

    # The same fixture file is used for both training and evaluation.
    overrides = dict(
        model_name_or_path=MODEL_NAME_OR_PATH,
        train_dataset_path=dataset_file,
        eval_dataset_path=dataset_file,
        output_dir=run_dir,
    )
    patched_config = self.sfttrain_tester.update_training_args(
        os.path.join(CONFIG_PATH, "full_tp_pp.yaml"), run_dir, overrides
    )
    entry_script = os.path.join(TRAIN_PATH, "run_finetune.py")

    # The device list is fixed to four devices rather than derived from the host.
    launch_cmd = ["python", "-u", "-m", "paddle.distributed.launch", "--devices", "0,1,2,3"]
    launch_cmd += [entry_script, patched_config]
    print(f"cmd {launch_cmd}")
    train_proc = subprocess.run(
        launch_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    # test training result
    self.sfttrain_tester.assert_result(train_proc.returncode, train_proc.stdout)

    # test training loss
    expected_loss = 11.956903
    self.sfttrain_tester.assert_loss(train_proc.stdout, expected_loss)

    # NOTE: the checkpoint-resume loss check is not exercised in this test.

    # test model generate
    expected_tokens = paddle.to_tensor(
        [[22407, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612]]
    )
    self.sfttrain_tester.create_and_check_model_generate(run_dir, expected_tokens)
254+
def test_sft_lora_tp_pp(self):
    """Run LoRA SFT with the ``lora_tp_pp.yaml`` config, merge, and validate.

    Two subprocesses are started through ``paddle.distributed.launch`` on
    devices 0-3: first ``run_finetune.py``, then ``tools/mergekit.py`` with
    the training output directory as ``--lora_model_path``. Each process's
    return code and captured output go to the tester's result check; the
    training output also goes to the loss check, and the merge output
    directory is passed to the generation check.
    """
    run_dir = os.path.join(OUTPUT_DIR, "sft_lora_tp_pp")
    dataset_file = "./tests/fixtures/dummy/ernie/sft-train.jsonl"

    # The same fixture file is used for both training and evaluation.
    overrides = dict(
        model_name_or_path=MODEL_NAME_OR_PATH,
        train_dataset_path=dataset_file,
        eval_dataset_path=dataset_file,
        output_dir=run_dir,
    )
    patched_config = self.sfttrain_tester.update_training_args(
        os.path.join(CONFIG_PATH, "lora_tp_pp.yaml"), run_dir, overrides
    )
    entry_script = os.path.join(TRAIN_PATH, "run_finetune.py")

    # Both the training run and the merge run share this launcher prefix.
    launcher = ["python", "-u", "-m", "paddle.distributed.launch", "--devices", "0,1,2,3"]
    capture = dict(stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    train_proc = subprocess.run(launcher + [entry_script, patched_config], **capture)

    # test training result
    self.sfttrain_tester.assert_result(train_proc.returncode, train_proc.stdout)

    # test training loss
    expected_loss = 11.956903
    self.sfttrain_tester.assert_loss(train_proc.stdout, expected_loss)

    # test lora merge
    merged_dir = os.path.join(run_dir, "lora_tp_pp_merge")
    merge_script = os.path.join(TRAIN_PATH, "tools/mergekit.py")
    merge_args = [
        merge_script,
        "--lora_model_path",
        run_dir,
        "--model_name_or_path",
        MODEL_NAME_OR_PATH,
        "--output_path",
        merged_dir,
    ]
    merge_proc = subprocess.run(launcher + merge_args, **capture)
    self.sfttrain_tester.assert_result(merge_proc.returncode, merge_proc.stdout)

    # test lora_merge_model generate
    expected_tokens = paddle.to_tensor(
        [[22407, 120525, 77505, 113631, 47887, 134141, 122487, 61092, 40897, 40601]]
    )
    self.sfttrain_tester.create_and_check_model_generate(merged_dir, expected_tokens)
312+
def test_sft_full_function_call(self):
    """Run full SFT on the function-call fixtures and validate the run.

    Uses the ``full_function_call.yaml`` config with separate train and eval
    fixture files. The training script is started through
    ``paddle.distributed.launch`` on devices 0-3; its return code and
    captured output go to the tester's result and loss checks, and the
    output directory is passed to the generation check.
    """
    run_dir = os.path.join(OUTPUT_DIR, "sft_full_function_call")
    fixture_root = "./tests/fixtures/dummy/function-call"

    overrides = dict(
        model_name_or_path=MODEL_NAME_OR_PATH,
        train_dataset_path=fixture_root + "/function-call-train.jsonl",
        eval_dataset_path=fixture_root + "/function-call-eval.jsonl",
        output_dir=run_dir,
    )
    patched_config = self.sfttrain_tester.update_training_args(
        os.path.join(CONFIG_PATH, "full_function_call.yaml"), run_dir, overrides
    )
    entry_script = os.path.join(TRAIN_PATH, "run_finetune.py")

    # The device list is fixed to four devices rather than derived from the host.
    launch_cmd = ["python", "-u", "-m", "paddle.distributed.launch", "--devices", "0,1,2,3"]
    launch_cmd += [entry_script, patched_config]
    print(f"cmd {launch_cmd}")
    train_proc = subprocess.run(
        launch_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    # test training result
    self.sfttrain_tester.assert_result(train_proc.returncode, train_proc.stdout)

    # test training loss
    expected_loss = 11.769741
    self.sfttrain_tester.assert_loss(train_proc.stdout, expected_loss)

    # NOTE: the checkpoint-resume run and its loss check are not exercised in this test.

    # test model generate
    expected_tokens = paddle.to_tensor(
        [[22407, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612, 90612]]
    )
    self.sfttrain_tester.create_and_check_model_generate(run_dir, expected_tokens)
0 commit comments