# tokenize-dataset.py
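"""Tokenize a text dataset for training with one of several number-encoding schemes
(xval, fp15, p10, p1000, b1999), build the matching tokenizer, and write a YAML
training config pointing at the produced artifacts.

Example invocation (paths are placeholders):

    python tokenize-dataset.py --datapath ./data/my_dataset --savepath ./data/my_dataset --encoding xval
"""
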
import argparse
import os

import numpy as np
import yaml
from datasets import DatasetDict
from transformers import PreTrainedTokenizerFast

from xval.preprocess import extract_all_keys, tokenize_fnc, convert_num_string
from xval.tokenizer import make_tokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--datapath", type=str, help="Path to data")
parser.add_argument("--savepath", type=str, help="Save path for tokenizer and tokenized dataset.")
parser.add_argument("--encoding", type=str, choices=['xval', 'fp15', 'p10', 'p1000', 'b1999'], default='xval', help="Choose your encoding scheme. (xVal is ours.)")
args = parser.parse_args()
data_path = args.datapath
save_path = args.savepath
print(f"Using encoding: {args.encoding}")
if save_path is None:
    save_path = data_path  # use same path by default
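
# Discover plain-text split files in data_path. The splits are keyed by file name
# below (e.g. ds["train"]), so the files are expected to be named "train", "val",
# and "test"; json/yaml/tokenizer artifacts in the same directory are skipped.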
files = [
    f
    for f in os.listdir(data_path)
    if os.path.isfile(os.path.join(data_path, f))
    and ("test" in f or "train" in f or "val" in f)
    and "json" not in f
    and "token" not in f
    and "yaml" not in f
]
print(f"\nLoading dataset from {data_path} with splits: {files}")
ds = DatasetDict.from_text({file: data_path + "/" + file for file in files})
print("\nExtracting keys from the train set...")
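
# Collect the set of field keys appearing in the (JSON-like) training samples.
# These are passed to make_tokenizer below as sample_keys, presumably so the
# structured keys get dedicated vocabulary entries.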
ds_keys = ds["train"].map(
    lambda x: {"keys": extract_all_keys(x["text"])},
    num_proc=30,
    remove_columns=["text"],
    load_from_cache_file=False,
)
sample_keys = list(set([item for sublist in ds_keys["keys"] for item in sublist]))
print(f"\nExtracted keys from dataset: {sample_keys}\n")
tokenizer_path = os.path.join(save_path, f"tokenizer_{args.encoding}.json")
os.makedirs(save_path, exist_ok=True)
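
# Build and save the tokenizer for the chosen encoding. In the xval scheme each
# number is represented by a single placeholder token (its value is carried
# separately), while the other schemes (fp15, p10, p1000, b1999) spell numbers
# out as text tokens.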
make_tokenizer(
    encoding=args.encoding,
    save_file=tokenizer_path,
    efficient_json=True,
    sample_keys=sample_keys,
)
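
# Wrap the saved tokenizer file so it can be used through the transformers API.
# Note that [END] serves as both the beginning- and end-of-sequence token here.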
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=tokenizer_path,
    bos_token="[END]",
    eos_token="[END]",
    mask_token="[MASK]",
    pad_token="[PAD]",
)
print("Tokenizer saved to: ", tokenizer_path)
print("Vocab size: ", len(tokenizer.vocab))
print("\nStarting tokenization...")
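
# Two tokenization paths: for xval, tokenize_fnc processes one sample at a time
# and (as used further below) also emits a per-sample "len" column alongside the
# token ids; for the text-based encodings, numbers are first rewritten by
# convert_num_string and the result is tokenized in batches of 100.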
if args.encoding == 'xval':
    tokenize_lambda = lambda x: tokenize_fnc(x, tokenizer)
    batched = False
    batch_size = None
else:
    def tokenize_lambda(samples):
        out = []
        for sample in samples["text"]:
            out.append(tokenizer.encode(convert_num_string(sample)))
        return {"input_ids": out}

    batched = True
    batch_size = 100
tokenized_ds = ds.map(
    tokenize_lambda,
    batched=batched,
    num_proc=30,
    batch_size=batch_size,
    remove_columns=["text"],
    load_from_cache_file=False,
)
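
# Record the longest tokenized sequence (used as block_size in the config below).
# For xval the length comes from the "len" column written during tokenization;
# for the other encodings it is recomputed here from the train split only.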
if args.encoding == "xval":
    max_len = max([max(tokenized_ds[key]["len"]) for key in tokenized_ds.keys()])
    tokenized_ds = DatasetDict(
        {key: val.remove_columns(["len"]) for key, val in tokenized_ds.items()}
    )
else:
    lens = tokenized_ds["train"].map(
        lambda x: {"len": len(x["input_ids"])},
        num_proc=30,
        remove_columns="input_ids",
        load_from_cache_file=False,
    )["len"]
    max_len = max(lens)
print(f"Longest sequence length: {max_len}")
print("\nTokenization finished. Saving...")
full_save_path = os.path.join(save_path, f"tokenized_ds_{args.encoding}")
tokenized_ds.save_to_disk(full_save_path)
print("Tokenized dataset saved to: ", full_save_path)
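
# Write a small training config pointing at the tokenizer and tokenized dataset.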
config = {
    "vocab_size": len(tokenizer.vocab),
    "block_size": max_len,
    "tokenizer": tokenizer_path,
    "dataset": full_save_path,
    "dataset_type": "hf_fullsample_numbers_mlm",
    "mask_token": "[MASK]",
}
config_path = os.path.join(save_path, f"config_{args.encoding}.yaml")
with open(config_path, "w") as file:
    yaml.dump(config, file)
print("\nConfiguration saved to: ", config_path)
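
# Rough sketch (for illustration only, not executed by this script) of how the
# saved artifacts could be reloaded later from the config written above:
#
#   import yaml
#   from datasets import load_from_disk
#   from transformers import PreTrainedTokenizerFast
#
#   with open("config_xval.yaml") as f:          # path as printed by this script
#       cfg = yaml.safe_load(f)
#   tokenizer = PreTrainedTokenizerFast(
#       tokenizer_file=cfg["tokenizer"],
#       bos_token="[END]", eos_token="[END]",
#       mask_token="[MASK]", pad_token="[PAD]",
#   )
#   tokenized_ds = load_from_disk(cfg["dataset"])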