
Commit 1793f01

Add files via upload
1 parent 3117319 commit 1793f01

File tree: 6 files changed (+492 / -0 lines)


lora/ds_config.json

+22
@@ -0,0 +1,22 @@
{
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": 1.0,
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "flops_profiler": {
        "enabled": false,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": true,
        "output_file": null
    }
}
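
The "auto" values and the bf16 flag in this config are resolved by the Hugging Face Trainer/DeepSpeed integration from the corresponding training arguments when the file is passed via --deepspeed. A minimal launch sketch, assuming the config is used with fine-tune.py below; the data path and hyper-parameter values are placeholders, not taken from this commit:

deepspeed fine-tune.py \
    --deepspeed ds_config.json \
    --data_path ./data/train.jsonl \
    --output_dir output \
    --bf16 True \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --num_train_epochs 3 \
    --learning_rate 2e-5

Under ZeRO stage 3, stage3_gather_16bit_weights_on_model_save makes DeepSpeed consolidate a full 16-bit copy of the weights when the Trainer saves the model.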

lora/evaluate.sh

+4
@@ -0,0 +1,4 @@
python infer-tune.py \
    --data_path ./data/pkumod-ccks_query_list_test4.txt \
    --output_dir output

lora/fine-tune.py

+252
@@ -0,0 +1,252 @@
import os
import math
import pathlib
from typing import Optional, Dict
from dataclasses import dataclass, field
import json
import time

import torch
from torch.utils.data import Dataset
import transformers
from transformers.training_args import TrainingArguments

os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo"
os.environ["WANDB_DISABLED"] = "true"


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default=r"E:\pretraing_models\torch\baichuan2-7B-Chat")


@dataclass
class DataArguments:
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    max_source_length: int = field(
        default=1000,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    max_target_length: int = field(
        default=200,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    use_lora: bool = field(default=True)
    model_max_length: int = field(
        default=1201,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning on multi-turn conversations."""

    def __init__(
        self,
        data_path,
        tokenizer,
        model_max_length,
        user_tokens=[195],       # special token id marking a user turn
        assistant_tokens=[196],  # special token id marking an assistant turn
    ):
        super(SupervisedDataset, self).__init__()
        self.data = json.load(open(data_path))
        self.tokenizer = tokenizer
        self.model_max_length = model_max_length
        self.user_tokens = user_tokens
        self.assistant_tokens = assistant_tokens
        self.ignore_index = -100
        # decode one sample so the label construction can be inspected by eye
        item = self.preprocessing(self.data[120])
        # print("input:", self.tokenizer.decode(item["input_ids"]))
        labels = []
        for id_ in item["labels"]:
            if id_ == -100:
                continue
            labels.append(id_)
        print("label:", self.tokenizer.decode(labels))

    def __len__(self):
        return len(self.data)

    def preprocessing(self, example):
        input_ids = []
        labels = []

        for message in example["conversations"]:
            from_ = message["from"]
            value = message["value"]
            value_ids = self.tokenizer.encode(value)

            if from_ == "human":
                # human turns: label the role-token position with EOS and mask the content
                input_ids += self.user_tokens + value_ids
                labels += [self.tokenizer.eos_token_id] + [self.ignore_index] * len(
                    value_ids
                )
            else:
                # assistant turns: the content is supervised, only the role token is masked
                input_ids += self.assistant_tokens + value_ids
                labels += [self.ignore_index] + value_ids
        input_ids.append(self.tokenizer.eos_token_id)
        labels.append(self.tokenizer.eos_token_id)
        # truncate, then pad to model_max_length; padded label positions are ignored by the loss
        input_ids = input_ids[: self.model_max_length]
        labels = labels[: self.model_max_length]
        input_ids += [self.tokenizer.pad_token_id] * (
            self.model_max_length - len(input_ids)
        )
        labels += [self.ignore_index] * (self.model_max_length - len(labels))
        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        return self.preprocessing(self.data[idx])


class MySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning on instruction/output pairs (one JSON object per line)."""

    def __init__(
        self,
        data_path,
        tokenizer,
        max_source_length,
        max_target_length,
        max_seq_length,
    ):
        super(MySupervisedDataset, self).__init__()
        self.data = self.load_data(data_path)
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.max_seq_length = max_seq_length
        self.ignore_index = -100
        # decode one sample so the label construction can be inspected by eye
        item = self.preprocessing(self.data[1])
        print("input:", self.tokenizer.decode(item["input_ids"]))
        labels = []
        for id_ in item["labels"]:
            if id_ == -100:
                continue
            labels.append(id_)
        print("label:", self.tokenizer.decode(labels))

    def load_data(self, data_path):
        D = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                D.append(line)
        return D

    def __len__(self):
        return len(self.data)

    def preprocessing(self, example):
        prompt, answer = example['instruction'], example['output']

        a_ids = self.tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                      max_length=self.max_source_length)
        b_ids = self.tokenizer.encode(text=answer, add_special_tokens=False, truncation=True,
                                      max_length=self.max_target_length)

        context_length = len(a_ids)
        input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
        labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]

        # pad to max_seq_length, then replace prompt and padding label positions with -100
        # so only the answer tokens contribute to the loss
        pad_len = self.max_seq_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
        labels = labels + [self.tokenizer.pad_token_id] * pad_len
        labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        return self.preprocessing(self.data[idx])


def train():
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # load the base model in half precision
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        trust_remote_code=True,
        cache_dir=training_args.cache_dir,
    ).half()

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        use_fast=False,
        trust_remote_code=True,
        model_max_length=training_args.model_max_length,
        cache_dir=training_args.cache_dir,
    )
    if training_args.use_lora:
        from peft import LoraConfig, TaskType, get_peft_model

        # apply LoRA to the model's fused QKV projection ("W_pack")
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["W_pack"],
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
        )
        model.enable_input_require_grads()
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

    dataset = MySupervisedDataset(
        data_args.data_path, tokenizer, data_args.max_source_length, data_args.max_target_length, training_args.model_max_length
    )

    trainer = transformers.Trainer(
        model=model, args=training_args, train_dataset=dataset, tokenizer=tokenizer
    )

    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()
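
Since use_lora defaults to True, trainer.save_model() writes the LoRA adapter weights (not a full base-model checkpoint) to output_dir. A minimal sketch of reloading them for inference with peft; the paths below are placeholders, and the commit's own inference entry point, infer-tune.py, is not among the files shown here:

import torch
import transformers
from peft import PeftModel

base_model_path = r"E:\pretraing_models\torch\baichuan2-7B-Chat"  # default base model in fine-tune.py
adapter_path = "output"  # wherever --output_dir pointed during training (placeholder)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_path, use_fast=False, trust_remote_code=True
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_path, trust_remote_code=True, torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(model, adapter_path)  # attach the saved LoRA adapter
model.eval()
# optionally fold the adapter into the base weights for export:
# model = model.merge_and_unload()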
