-
Notifications
You must be signed in to change notification settings - Fork 29
/
generate_test_data.py
67 lines (58 loc) · 2.01 KB
/
generate_test_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import argparse
import os
from dataclasses import dataclass, field
from typing import List

import torch
from transformer_lens import HookedTransformer
from transformers import AutoTokenizer

import activation_dataset
from config import BaseArgs
@dataclass
class GenTestArgs(BaseArgs):
    """Arguments for generating a small test activation dataset.

    Controls which model/dataset to pull activations from, which layers to
    hook, and where the resulting chunk files are written.
    """

    # Model name, passed both to TransformerLens and to AutoTokenizer.
    model: str = "EleutherAI/pythia-70m-deduped"
    # Number of activation chunks to write, and how many to skip first
    # (useful for resuming a partial run).
    n_chunks: int = 1
    skip_chunks: int = 0
    # Target size of each chunk file, in gigabytes (used by setup_data).
    chunk_size_gb: float = 2.0
    # Deliberately unannotated: this stays a plain class attribute rather than
    # a dataclass field, so BaseArgs-derived CLI/field machinery ignores it.
    # Used by the setup_data_new path as activations-per-chunk.
    activation_chunk_size = 8192 * 1024
    # HuggingFace dataset to tokenize and run through the model.
    dataset: str = "NeelNanda/pile-10k"
    # Mutable defaults must go through default_factory: a bare list literal
    # here makes @dataclass raise ValueError at class-definition time.
    layers: List[int] = field(default_factory=lambda: [2])
    # Hook location for the setup_data (TransformerLens) path...
    location: str = "residual"
    # ...and the list of locations for the setup_data_new path.
    locations: List[str] = field(default_factory=lambda: ["residual"])
    # Output root; one sub-folder per layer, named via layer_folder_fmt.
    dataset_folder: str = "activation_data"
    layer_folder_fmt: str = "layer_{layer}"
    device: str = "cuda:0"
    # True: use TransformerLens (setup_data); False: use setup_data_new.
    use_tl: bool = True
if __name__ == "__main__":
    # Script entry point: build the config, then dispatch to one of the two
    # dataset-generation backends depending on use_tl.
    args = GenTestArgs()
    device = torch.device(args.device)

    if args.use_tl:
        # TransformerLens path: load model + tokenizer, prepare one output
        # directory per requested layer, then stream activations to disk.
        model = HookedTransformer.from_pretrained(args.model, device=args.device)
        tokenizer = AutoTokenizer.from_pretrained(args.model)

        os.makedirs(args.dataset_folder, exist_ok=True)
        layer_dirs = []
        for layer in args.layers:
            sub_dir = os.path.join(
                args.dataset_folder, args.layer_folder_fmt.format(layer=layer)
            )
            os.makedirs(sub_dir, exist_ok=True)
            layer_dirs.append(sub_dir)

        activation_dataset.setup_data(
            tokenizer,
            model,
            args.dataset,
            layer_dirs,
            layer=args.layers,
            layer_loc=args.location,
            n_chunks=args.n_chunks,
            chunk_size_gb=args.chunk_size_gb,
            device=device,
            skip_chunks=args.skip_chunks,
        )
    else:
        # Non-TransformerLens path: setup_data_new handles model loading and
        # directory layout itself from the raw argument values.
        activation_dataset.setup_data_new(
            args.model,
            args.dataset,
            args.dataset_folder,
            args.locations,
            args.activation_chunk_size,
            args.n_chunks,
            skip_chunks=args.skip_chunks,
            device=device,
        )