-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconfigs.py
77 lines (67 loc) · 2.9 KB
/
configs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def config_JointEmbeder() -> dict:
    """Return the default configuration dictionary for the JointEmbeder model.

    A new dict is built on every call, so callers may mutate the result
    without affecting later calls.

    Returns:
        dict: maps option name (str) to its default value. The entries are
        grouped below into data files, sequence-length / vocabulary
        settings, training settings and model settings.
    """
    conf = {
        # ---- data_params ----
        # Name of the dataset, used to specify a data loader.
        'dataset_name': 'CodeSearchDataset',
        # Training data.
        'train_name': 'train.name.h5',
        'train_api': 'train.apiseq.h5',
        'train_tokens': 'train.tokens.h5',
        'train_desc': 'train.desc.h5',
        # Test data. NOTE: the 'valid_*' keys point at the 'test.*' files.
        'valid_name': 'test.name.h5',
        'valid_api': 'test.apiseq.h5',
        'valid_tokens': 'test.tokens.h5',
        'valid_desc': 'test.desc.h5',
        'pool_size': 100,
        'top_k': 20,
        # "Use" data (computing code vectors).
        'use_codebase': 'use.rawcode.txt',  # alternative: 'use.rawcode.h5'
        'use_names': 'use.name.h5',
        'use_apis': 'use.apiseq.h5',
        'use_tokens': 'use.tokens.h5',
        # Results data (code vectors).
        'use_codevecs': 'use.codevecs.h5',

        # ---- sequence-length / vocabulary parameters ----
        'name_len': 6,
        'api_len': 30,
        'tokens_len': 50,
        'desc_len': 30,
        'n_words': 10000,  # len(vocabulary) + 1
        # Vocabulary info.
        'vocab_name': 'vocab.name.json',
        'vocab_api': 'vocab.apiseq.json',
        'vocab_tokens': 'vocab.tokens.json',
        'vocab_desc': 'vocab.desc.json',

        # ---- training_params ----
        'batch_size': 1,
        'chunk_size': 200000,
        'nb_epoch': 500,
        # 'optimizer': 'adam',
        'learning_rate': 2.08e-4,
        'adam_epsilon': 1e-8,
        'warmup_steps': 5000,
        'fp16': False,
        # For fp16: Apex AMP optimization level, one of 'O0', 'O1', 'O2'
        # and 'O3'. See details at https://nvidia.github.io/apex/amp.html
        'fp16_opt_level': 'O1',

        # ---- model_params ----
        'emb_size': 300,
        # Number of hidden dimensions of the code/desc representation.
        'n_hidden': 300,
        # Recurrent encoder.
        'lstm_dims': 256,  # * 2
        'margin': 0.3986,
        # Similarity measure: cos, poly, sigmoid, euc, gesd, aesd
        # (see https://arxiv.org/pdf/1508.01585.pdf). cos, poly and sigmoid
        # are fast with a simple dot product, while euc, gesd and aesd are
        # slow with vector normalization.
        'sim_measure': 'cos',

        # ---- hybrid-transformer ----
        # NOTE(review): the original "# recurrent" / "# * 2" remarks in this
        # section looked like copy-paste leftovers from the LSTM settings
        # above and were dropped; confirm against the model code.
        'N': 2,
        # Number of hidden dimensions of the code/desc representation.
        'd_model': 300,
        'd_ff': 300,
        'k': 5,
        'h': 6,
        'num_features': 3,
        'max_simple_name_len': 30,
        'dropout': 0.1,
    }
    return conf