-
Notifications
You must be signed in to change notification settings - Fork 3
/
predict_after.py
141 lines (131 loc) · 6.48 KB
/
predict_after.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import torch
from data_loader import *
from dataprocess_predict import *
from torch.utils.data import DataLoader
# from AlexNet_206_p import *
# from AlexNet_206 import *
import time
import json
class predict():
    """Run the per-trait models over a genotype file and publish the results.

    Constructing an instance executes the whole pipeline: ``__init__`` stores
    its arguments, reads the two lookup files under ``./predict`` and then
    calls :meth:`forward`, which predicts every selected trait for every
    sample, restores the raw model outputs, pushes the result to Redis and
    writes ``predict.csv`` into ``save_path``.

    Args:
        genotype_path: Path of the genotype file handed to ``data_process``.
        trait_for_predict: Iterable of trait names to predict. Ignored when
            ``if_all`` is true.
        save_path: Directory that receives ``predict.csv``.
        Redis: Object exposing ``set(key, value)``; used for progress and
            result messages.
        taskID: Key under which this task's messages are stored.
        if_all: When true, predict every known trait.

    Raises:
        ValueError: If a selected trait has no entry in the lookup file that
            is needed to restore its predictions.
    """

    TOKENIZERS_PARALLELISM = False

    # Traits restored by rescaling with the max/min read from n_trait.txt.
    _N_TRAITS = ('protein', 'oil', 'SdWgt', 'Yield', 'R8', 'R1', 'Hgt',
                 'Linoleic', 'Linolenic', 'Palmitic', 'Stearic', 'Oleic')
    # Traits restored by mapping the arg-max index through p_trait.txt.
    _P_TRAITS = ('MG', 'SQ', 'ST', 'Ldg', 'P_CLR', 'Mot', 'P_FRM', 'SC_L',
                 'SC_CLR', 'Stem term', 'H_CLR', 'PRR1', 'SCN3', 'FC',
                 'P_DENS', 'POD')

    def __init__(self, genotype_path, trait_for_predict, save_path, Redis, taskID="", if_all=False):
        self.Redis = Redis
        self.path = r'./predict/weight'
        self.taskID = taskID
        # NOTE(review): "predict_finish" is never set to True anywhere in this
        # class - confirm whether the consumer of this message relies on it.
        self.progressdict = {"title": "", "progress": "", "predict_finish": False}
        self.taskdict = {"result": pd.DataFrame(), "page": 0, "total_pages": 0, "col_names": []}
        self.IsMissing = False
        # Build the lists of traits to predict
        if not if_all:
            self.n_trait, self.p_trait, self.trait_list = self._split_traits(trait_for_predict)
        else:
            self.n_trait, self.p_trait = list(self._N_TRAITS), list(self._P_TRAITS)
            self.trait_list = self.n_trait + self.p_trait
        # Genotype file and output directory
        self.vcf_path = f'{genotype_path}'
        self.save_dir = f'{save_path}'
        # Lookup data used to restore predictions: one row per trait holding
        # its max/min, and one line per trait holding its category dictionary
        max_min = pd.read_csv(r'./predict/n_trait.txt', header=None)
        with open(r'./predict/p_trait.txt', 'r') as handle:
            self.p_data = [line.strip() for line in handle]
        self.n_data = np.array(max_min.iloc[:]).tolist()
        self.forward()

    @classmethod
    def _split_traits(cls, requested):
        """Sort requested traits into (n_traits, p_traits, all_in_request_order).

        Unknown names are reported and skipped, and repeated names are kept
        only once, so every returned trait has a model branch and appears as
        exactly one result column.
        """
        n_selected, p_selected, ordered = [], [], []
        for trait in requested:
            if trait in ordered:
                continue
            if trait in cls._N_TRAITS:
                n_selected.append(trait)
            elif trait in cls._P_TRAITS:
                p_selected.append(trait)
            else:
                print("Error:Couldn't find target trait!")
                continue
            ordered.append(trait)
        return n_selected, p_selected, ordered

    def insertRedis(self):
        """Store the current progress message under the 'progressdict' key."""
        msg = json.dumps({self.taskID: self.progressdict})
        self.Redis.set('progressdict', msg)

    def insertTaskRedis(self):
        """Store the current task/result message under the 'taskdict' key."""
        msg = json.dumps({self.taskID: self.taskdict})
        self.Redis.set('taskdict', msg)

    def timer(self):
        """Return the current time in seconds, as given by ``time.time()``."""
        return time.time()

    def _predict_trait(self, trait, feature, device):
        """Load the model for *trait*, run it on *feature*, return one value.

        For a trait in ``self.p_trait`` the value is the arg-max index of the
        model output; otherwise it is the first element of the output.
        """
        weight_path = os.path.join(self.path, f'{trait}_best.pt')
        net = torch.load(weight_path, map_location=device)
        net.eval()
        # Inference only: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            scores = net(feature).to('cpu').detach().numpy()
        # Drop the model before the next one is loaded and release cached memory.
        del net
        torch.cuda.empty_cache()
        if trait in self.p_trait:
            return np.argmax(scores, axis=1)[0]
        return scores[0][0]

    def _trait_range(self, trait):
        """Return ``(max, min)`` for *trait* from the rows of ``self.n_data``.

        The first field of a matching row is split on ';' and items 2 and 4
        are used as the max and the min.
        """
        for row in self.n_data:
            if trait in row[0]:
                fields = row[0].split(";")
                return float(fields[2]), float(fields[4])
        raise ValueError(f"No max/min entry found for trait {trait!r} in ./predict/n_trait.txt")

    def _trait_categories(self, trait):
        """Return the inverted category dictionary of *trait* from ``self.p_data``."""
        for line in self.p_data:
            if trait in line:
                # NOTE(review): eval executes whatever follows the first "{" on
                # the line; keep p_trait.txt trusted or switch to a literal parser.
                forward_map = eval("{" + line.split("{")[1])
                # Swap keys and values so predicted values can be mapped back.
                return dict(zip(forward_map.values(), forward_map.keys()))
        raise ValueError(f"No category entry found for trait {trait!r} in ./predict/p_trait.txt")

    def _restore(self, result):
        """Restore every column of *result* in place and return the frame.

        Columns of traits in ``self.n_trait`` are rescaled with
        ``value * (max - min) + min``; all other columns are mapped through
        the trait's inverted category dictionary. Progress is published to
        Redis before each trait is processed.
        """
        total = len(self.trait_list)
        for traitnum, trait in enumerate(self.trait_list, start=1):
            self.progressdict['title'] = f"Restoring trait data: {trait}"
            self.progressdict['progress'] = f"{(traitnum / total * 100):.2f}%"
            self.insertRedis()
            if trait in self.n_trait:
                max_of_trait, min_of_trait = self._trait_range(trait)
                result[trait] = result[trait] * (max_of_trait - min_of_trait) + min_of_trait
            else:
                result[trait] = result[trait].map(self._trait_categories(trait))
        return result

    def forward(self):
        """Predict, restore, publish and save the selected traits for all samples."""
        # Data preprocessing: returns the processed data and the sample names
        data_list = data_process(self.vcf_path, self.Redis, self.taskID)
        predict_data, sample_list = data_list.to_dataset()
        self.IsMissing = data_list.IsMissing
        # One sample per batch, in file order
        loader = DataLoader(data_loader(predict_data), batch_size=1, shuffle=False, num_workers=0)
        # Use the first GPU when there is one, otherwise run on the CPU
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        total = len(sample_list)
        result = {}
        # Make predictions for all traits in the trait list
        for index, feature in enumerate(loader):
            feature = feature.to(device)
            het = []
            self.progressdict['progress'] = f"{(index+1) / total * 100:.2f}%"
            self.insertRedis()
            for trait in self.trait_list:
                self.progressdict['title'] = f"Predicting: Sample {index + 1} ({index+1} / {total})'s trait {trait}"
                self.insertRedis()
                het.append(self._predict_trait(trait, feature, device))
            # Result dictionary, key-value pairs: sample: [trait 1, trait 2...]
            result[sample_list[index]] = het
            torch.cuda.empty_cache()
        t3 = self.timer()
        # Rows are sample IDs, columns are the predicted traits
        result = pd.DataFrame(result).transpose()
        result.columns = self.trait_list
        # Undo the normalisation / category encoding of the raw predictions
        result = self._restore(result)
        # Publish the result, then store it as csv
        result.index.name = "acid"
        result = result.reset_index()
        self.taskdict['result'] = result.to_json()
        self.insertTaskRedis()
        result.to_csv(os.path.join(self.save_dir, 'predict.csv'), index=False)
        self.progressdict['title'] = "Finish"
        self.progressdict['progress'] = "100%"
        self.insertRedis()
        t4 = self.timer()
        print(f'Restore has done! Use time:{t4-t3} Result has saved in save path')