from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default=r'E:\pretraing_models\torch\chatglm2_6b',
        # default=r'E:\pretraing_models\torch\chatglm3-6b-base',
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    ptuning_checkpoint: Optional[str] = field(
        default=None, metadata={"help": "Path to p-tuning v2 checkpoints"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
                "the model's position embeddings."
            )
        },
    )
    quantization_bit: Optional[int] = field(
        default=None,
        metadata={"help": "Bit width (e.g. 4 or 8) to quantize the model weights to. None loads the model unquantized."},
    )
    pre_seq_len: Optional[int] = field(
        default=None,
        metadata={"help": "Length of the trainable prefix (soft prompt) used by p-tuning v2."},
    )
    prefix_projection: bool = field(
        default=False,
        metadata={"help": "Whether to pass the prefix embeddings through an MLP projection (prefix-tuning style)."},
    )


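# Illustrative note (not part of the original file): ModelArguments can also be
# constructed directly in Python, e.g. to override the hardcoded local default
# path with a Hub model id; the values below are hypothetical:
#
#     args = ModelArguments(model_name_or_path="THUDM/chatglm2-6b", pre_seq_len=128)
#     assert args.use_fast_tokenizer and not args.prefix_projection
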
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    prompt_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    response_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
    )
    history_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the history of chat."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
            )
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. "
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )

    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to force as the first generated token after the decoder_start_token_id. "
                "Useful for multilingual models like mBART where the first generated token "
                "needs to be the target language token (usually it is the target language token)."
            )
        },
    )

    def __post_init__(self):
        if (
            self.dataset_name is None
            and self.train_file is None
            and self.validation_file is None
            and self.test_file is None
        ):
            raise ValueError("Need either a dataset name or a training/validation/test file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length

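
# --- Usage sketch (illustrative, not part of the original file) ---
# These dataclasses are typically consumed through transformers' HfArgumentParser,
# as in the ChatGLM p-tuning scripts this file resembles; pairing them with
# Seq2SeqTrainingArguments is an assumption about the surrounding training script.
# Run with e.g.: python arguments.py --output_dir out --train_file data/train.json
if __name__ == "__main__":
    from transformers import HfArgumentParser, Seq2SeqTrainingArguments

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # By this point __post_init__ has validated the data files and resolved
    # val_max_target_length, so downstream code can rely on both.
    print(model_args.model_name_or_path, data_args.max_source_length)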