26
26
--save-folder ./processed \
27
27
--tokenizer-path pretrained_model_name_or_path \
28
28
--prompt-template internlm2_chat \
29
- --dataset-format openai \
30
- --is-ftdp
29
+ --dataset-format ftdp
31
30
32
31
normal json dataset:
33
32
srun -p llm_razor --quotatype=auto --gres=gpu:1 --ntasks=1 \
@@ -48,10 +47,10 @@ def parse_args():
48
47
'--tokenizer-path' , help = 'The path to the hf tokenizer.' )
49
48
parser .add_argument (
50
49
'--dataset-format' ,
51
- choices = DATASET_FORMAT_MAPPING .keys (),
50
+ choices = list ( DATASET_FORMAT_MAPPING .keys ()) + [ 'ftdp' ] ,
52
51
default = None ,
53
- help = 'Which dataset format is this data. '
54
- f'The available choices are { DATASET_FORMAT_MAPPING .keys ()} ' )
52
+ help = 'Which dataset format is this data. The available choices are '
53
+ f" { list ( DATASET_FORMAT_MAPPING .keys ()) + [ 'ftdp' ] } . " )
55
54
parser .add_argument (
56
55
'--prompt-template' ,
57
56
choices = PROMPT_TEMPLATE .keys (),
@@ -67,10 +66,6 @@ def parse_args():
67
66
'--file-type' ,
68
67
default = '.json' ,
69
68
help = 'We want to get the order of the file in this type.' )
70
- parser .add_argument (
71
- '--is-ftdp' ,
72
- action = 'store_true' ,
73
- help = 'Whether it is in ftdp data format' )
74
69
parser .add_argument (
75
70
'--data-order-path' ,
76
71
default = None ,
@@ -168,15 +163,22 @@ def process_untokenized_dataset(folder,
168
163
pretrained_model_name_or_path = args .tokenizer_path ,
169
164
trust_remote_code = True ,
170
165
padding_side = 'right' )
166
+
167
+ if args .dataset_format is None :
168
+ dataset_map_fn = None
169
+ elif args .dataset_format == 'ftdp' :
170
+ dataset_map_fn = DATASET_FORMAT_MAPPING ['openai' ]
171
+ else :
172
+ dataset_map_fn = DATASET_FORMAT_MAPPING [args .dataset_format ]
173
+
171
174
datasets_dict = process_untokenized_dataset (
172
175
args .data_folder ,
173
176
tokenizer ,
174
177
args .max_length ,
175
178
args .pack_to_max_length ,
176
- DATASET_FORMAT_MAPPING [args .dataset_format ]
177
- if args .dataset_format is not None else None ,
179
+ dataset_map_fn ,
178
180
PROMPT_TEMPLATE [args .prompt_template ],
179
181
data_order_path = args .data_order_path ,
180
182
file_type = args .file_type ,
181
- is_ftdp = args .is_ftdp )
183
+ is_ftdp = args .dataset_format == 'ftdp' )
182
184
datasets_dict .save_to_disk (args .save_folder )
0 commit comments