
Commit 61873a0

Commit message: failed

1 parent ec02d6d · commit 61873a0

18 files changed: +1834 −0 lines changed

failed_attempts/RNN/e2vtrain.py

+68
@@ -0,0 +1,68 @@
import json
from gensim.models import word2vec

# sentences=word2vec.LineSentence("/home/ubuntu/thesiswork/source/corpus/fullcorpus10000.txt")
sentences=word2vec.LineSentence("/home/ubuntu/thesiswork/source/corpus/corpus_all.txt")
# sentences=word2vec.LineSentence("/home/ubuntu/results/hivo_corpus.txt")

# Term frequencies for every concept code; keys look like "<prefix>#<code>".
f=open("/home/ubuntu/results/statistics/tf_all.json",'r')
tf_all=json.load(f)
f.close()

# Re-key by the bare concept code and add a small epsilon so no frequency is zero.
tf_all_com={}
for k in tf_all.keys():
    _k=k.split("#")[1]
    tf_all_com[_k]=tf_all[k]+1e-10

# model=word2vec.Word2Vec(sg=0,size=128,window=10,min_count=0,sample=1e-3,hs=0,negative=1,sorted_vocab=1)
# model.build_vocab_from_freq(tf_all_com)
# model.train(sentences,total_examples=10000,epochs=100)
# path="/home/ubuntu/results/models/e2v_sg_5000_e100_d128.model"
# # path="/home/ubuntu/results/e2v_sg_e100.model"
# model.save(path)

# path="/home/ubuntu/results/models/e2v_sg_10000_e200_d64.model"
# model=word2vec.Word2Vec.load(path)

# ======

# 64-dimensional model with negative sampling (sg=0 selects the CBOW architecture in gensim).
model=word2vec.Word2Vec(sg=0,size=64,window=10,min_count=0,sample=1e-3,hs=0,negative=5,workers=4,sorted_vocab=1,compute_loss=True)
model.build_vocab_from_freq(tf_all_com)
model.train(sentences,total_examples=140000,epochs=200)
path="/home/ubuntu/results/models/e2v_sg_140k_e200_d64.model"
# path="/home/ubuntu/results/e2v_sg_e100.model"
model.save(path)

# ======

# model=word2vec.Word2Vec(sg=0,size=32,window=10,min_count=0,sample=1e-3,hs=0,negative=1,sorted_vocab=1)
# model.build_vocab_from_freq(tf_all_com)
# model.train(sentences,total_examples=10000,epochs=100)
# path="/home/ubuntu/results/models/e2v_sg_5000_e100_d32.model"
# # path="/home/ubuntu/results/e2v_sg_e100.model"
# model.save(path)

# model=word2vec.Word2Vec(sg=1,size=128,window=10,min_count=0,sample=1e-3,hs=0,negative=1,sorted_vocab=1)
# model.build_vocab_from_freq(tf_all_com)
# model.train(sentences,total_examples=10000,epochs=100)
# path="/home/ubuntu/results/models/e2v_cbow_5000_e100_d128.model"
# # path="/home/ubuntu/results/e2v_cbow_e100.model"
# model.save(path)

# ======

# Second 64-dimensional model (sg=1 selects the skip-gram architecture in gensim).
model=word2vec.Word2Vec(sg=1,size=64,window=10,min_count=0,sample=1e-3,hs=0,negative=5,workers=4,sorted_vocab=1,compute_loss=True)
model.build_vocab_from_freq(tf_all_com)
model.train(sentences,total_examples=140000,epochs=200)
path="/home/ubuntu/results/models/e2v_cbow_140k_e200_d64.model"
# path="/home/ubuntu/results/e2v_cbow_e100.model"
model.save(path)

# ======

# model=word2vec.Word2Vec(sg=1,size=32,window=10,min_count=0,sample=1e-3,hs=0,negative=1,sorted_vocab=1)
# model.build_vocab_from_freq(tf_all_com)
# model.train(sentences,total_examples=10000,epochs=100)
# path="/home/ubuntu/results/models/e2v_cbow_5000_e100_d32.model"
# # path="/home/ubuntu/results/e2v_cbow_e100.model"
# model.save(path)
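
Note: the sketch below is an illustrative way to inspect one of the saved models; it assumes the gensim < 4.0 API used in this script (size=, model.wv), and the concept code "C12345" is only a placeholder, not a code taken from the actual corpus.

# Hypothetical usage sketch (not part of the committed script).
from gensim.models import word2vec

model = word2vec.Word2Vec.load("/home/ubuntu/results/models/e2v_sg_140k_e200_d64.model")

# Five nearest concept codes by cosine similarity; "C12345" is a placeholder code.
print(model.wv.most_similar("C12345", topn=5))

# Raw 64-dimensional vector for the same code.
print(model.wv["C12345"].shape)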

failed_attempts/RNN/emb_evaluation.py

+157
@@ -0,0 +1,157 @@

# coding: utf-8

# In[23]:

import json
import numpy as np
from gensim.models import word2vec as w2v

# In[24]:

# Pairs of concept codes treated as synonyms.
f=open("/home/ubuntu/thesiswork/source/coded_syns.json",'r')
coded_syns=json.load(f)
f.close()

# In[25]:

def load_models():
    path="/home/ubuntu/results/models/e2v_sg_140k_e200_d64.model"
    e2v_model=w2v.Word2Vec.load(path)
    f=open("/home/ubuntu/results/ontology/KG_n2v_d64.json",'r')
    n2v_model=json.load(f)
    f.close()
    return e2v_model,n2v_model

e2v_model,n2v_model=load_models()

def load_sups():
    f=open("/home/ubuntu/results/ontology/c2id.json",'r')
    c2id=json.load(f)
    f.close()
    prefix='http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#'
    return c2id,prefix

c2id,prefix=load_sups()

# In[26]:

f=open("/home/ubuntu/results/ontology/full_word_list.json",'r')
word_list=json.load(f)[1:]
f.close()

# In[27]:

def get_emb(_code):
    # Concatenation of the corpus (word2vec) vector and the ontology (node2vec) vector.
    e_vec=list(e2v_model.wv[_code])
    n_vec=n2v_model[str(c2id[prefix+_code])]
    return e_vec+n_vec

def get_embe(_code):
    # Corpus (word2vec) vector only.
    e_vec=list(e2v_model.wv[_code])
    # n_vec=n2v_model[str(c2id[prefix+_code])]
    return e_vec

def get_embn(_code):
    # Ontology (node2vec) vector only.
    # e_vec=list(e2v_model.wv[_code])
    n_vec=n2v_model[str(c2id[prefix+_code])]
    return n_vec

# In[28]:

from scipy.spatial.distance import cosine

# In[ ]:

f=open("/home/ubuntu/results/logs/emb_evaluation_sg.txt","w")

# For each synonym pair, rank every code in the vocabulary by cosine distance to one
# member of the pair and record the position of the other member (0 = nearest neighbour).
avg_pos_syn0=0.0
avg_pos_syn1=0.0
for i,syn in enumerate(coded_syns):
    res_dict_syn0={}
    res_dict_syn1={}
    emb_syn0=np.array(get_embe(syn[0]))
    emb_syn1=np.array(get_embe(syn[1]))
    for w in word_list:
        _w=w.split('#')[1]
        emb_w=np.array(get_embe(_w))
        dist_syn0=cosine(emb_syn0,emb_w)
        dist_syn1=cosine(emb_syn1,emb_w)
        res_dict_syn0[_w]=dist_syn0
        res_dict_syn1[_w]=dist_syn1
    cur_pos_syn0=sorted(res_dict_syn0,key=res_dict_syn0.get).index(syn[1])
    cur_pos_syn1=sorted(res_dict_syn1,key=res_dict_syn1.get).index(syn[0])
    f.write("%s,%d,%s,%d\n"%(syn[0],cur_pos_syn0,syn[1],cur_pos_syn1))
    avg_pos_syn0+=cur_pos_syn0
    avg_pos_syn1+=cur_pos_syn1
avg_pos_syn0/=len(coded_syns)
avg_pos_syn1/=len(coded_syns)

f.write("%f,%f\n"%(avg_pos_syn0,avg_pos_syn1))

# The same evaluation using the node2vec-only embeddings (get_embn):

# avg_pos_syn0=0.0
# avg_pos_syn1=0.0
# for i,syn in enumerate(coded_syns):
#     res_dict_syn0={}
#     res_dict_syn1={}
#     emb_syn0=np.array(get_embn(syn[0]))
#     emb_syn1=np.array(get_embn(syn[1]))
#     for w in word_list:
#         _w=w.split('#')[1]
#         emb_w=np.array(get_embn(_w))
#         dist_syn0=cosine(emb_syn0,emb_w)
#         dist_syn1=cosine(emb_syn1,emb_w)
#         res_dict_syn0[_w]=dist_syn0
#         res_dict_syn1[_w]=dist_syn1
#     cur_pos_syn0=sorted(res_dict_syn0,key=res_dict_syn0.get).index(syn[1])
#     cur_pos_syn1=sorted(res_dict_syn1,key=res_dict_syn1.get).index(syn[0])
#     f.write("%s,%d,%s,%d\n"%(syn[0],cur_pos_syn0,syn[1],cur_pos_syn1))
#     avg_pos_syn0+=cur_pos_syn0
#     avg_pos_syn1+=cur_pos_syn1
# avg_pos_syn0/=len(coded_syns)
# avg_pos_syn1/=len(coded_syns)

# f.write("%f,%f\n"%(avg_pos_syn0,avg_pos_syn1))

# The same evaluation using the concatenated embeddings (get_emb):

# avg_pos_syn0=0.0
# avg_pos_syn1=0.0
# for i,syn in enumerate(coded_syns):
#     res_dict_syn0={}
#     res_dict_syn1={}
#     emb_syn0=np.array(get_emb(syn[0]))
#     emb_syn1=np.array(get_emb(syn[1]))
#     for w in word_list:
#         _w=w.split('#')[1]
#         emb_w=np.array(get_emb(_w))
#         dist_syn0=cosine(emb_syn0,emb_w)
#         dist_syn1=cosine(emb_syn1,emb_w)
#         res_dict_syn0[_w]=dist_syn0
#         res_dict_syn1[_w]=dist_syn1
#     cur_pos_syn0=sorted(res_dict_syn0,key=res_dict_syn0.get).index(syn[1])
#     cur_pos_syn1=sorted(res_dict_syn1,key=res_dict_syn1.get).index(syn[0])
#     f.write("%s,%d,%s,%d\n"%(syn[0],cur_pos_syn0,syn[1],cur_pos_syn1))
#     avg_pos_syn0+=cur_pos_syn0
#     avg_pos_syn1+=cur_pos_syn1
# avg_pos_syn0/=len(coded_syns)
# avg_pos_syn1/=len(coded_syns)

# f.write("%f,%f\n"%(avg_pos_syn0,avg_pos_syn1))

f.close()
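
Note: the inner loop above computes one cosine distance per vocabulary code per synonym pair; the sketch below shows the same rank computation in vectorized form, assuming the candidate embeddings have already been stacked into a numpy matrix. The names rank_of, emb_matrix, codes, and query_vec are illustrative and do not appear in the committed script.

# Hypothetical vectorized variant (not part of the committed script).
import numpy as np

def rank_of(target_code, query_vec, emb_matrix, codes):
    # Cosine distance from query_vec to every row of emb_matrix.
    norms = np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(query_vec)
    dists = 1.0 - emb_matrix.dot(query_vec) / norms
    # Position of target_code when candidates are sorted by distance (0 = nearest).
    order = np.argsort(dists)
    return int(np.where(np.array(codes)[order] == target_code)[0][0])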

failed_attempts/RNN/gpu_score.py

+111
@@ -0,0 +1,111 @@
import os
import json
import numpy as np
from scipy.spatial.distance import cosine
from keras.models import Sequential, load_model
from keras.layers import LSTM,Bidirectional,Masking,BatchNormalization
from keras.callbacks import EarlyStopping
from gensim.models import word2vec as w2v

dim=128
maxlen=512
volume=1000
homedir=os.environ['HOME']

def load_models():
    path=homedir+"/results/models/e2v_sg_10000_e100_d64.model"
    e2v_model=w2v.Word2Vec.load(path)
    f=open(homedir+"/results/ontology/KG_n2v_d64.json",'r')
    n2v_model=json.load(f)
    f.close()
    return e2v_model,n2v_model

e2v_model,n2v_model=load_models()

def load_sups():
    f=open(homedir+"/results/ontology/c2id.json",'r')
    c2id=json.load(f)
    f.close()
    f=open(homedir+"/results/ontology/full_word_list.json",'r')
    word_list=json.load(f)
    f.close()
    prefix='http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#'
    return c2id,prefix,word_list[1:]

c2id,prefix,word_list=load_sups()

def get_emb(_code):
    # Concatenate the 64-dim word2vec vector with the 64-dim node2vec vector (128 dims total).
    e_vec=list(e2v_model.wv[_code])
    n_vec=n2v_model[str(c2id[prefix+_code])]
    return e_vec+n_vec

def load_corpus(_path):
    f=open(_path,'r')
    pre_corpus=f.read()
    f.close()
    pre_list=pre_corpus.split("\n")[:-1]
    corpus=[]
    for i,p in enumerate(pre_list):
        _p=p.split(" ")[:-1]
        corpus.append(_p)
    # Evaluate on a 200-line slice starting at `volume`.
    return corpus[volume:volume+200]

path=homedir+"/thesiswork/source/corpus/fullcorpusall.txt"
corpus=load_corpus(path)

def find_match(vec,num):
    # Return the vocabulary code whose embedding is closest to vec by cosine distance
    # (num is currently unused).
    min_dis=np.inf
    min_word=None
    for w in word_list:
        dis=cosine(vec,get_emb(w.split('#')[1]))
        if dis<min_dis:
            min_dis=dis
            min_word=w.split('#')[1]
    return min_word,min_dis

def test_on_data(_corpus,_maxlen,_model):
    # Corpus lines are consumed in consecutive pairs: predict an embedding from the
    # first line and count a hit when its nearest code occurs in the second line.
    i=0
    comp_vec=[0.0]*dim  # zero vector used to pad short sequences up to _maxlen
    ndata=[]
    hit=0.0
    while(i<len(_corpus)-1):
        _body=_corpus[i]
        i+=1
        _rbody=set(_corpus[i])
        b_emb=[]
        if len(_body)<_maxlen:
            for w in _body:
                b_emb.append(get_emb(w))
            for j in range(len(b_emb),_maxlen):
                b_emb.append(comp_vec)
            ndata=np.array([b_emb])
            y_out=_model.predict(ndata)
            match,dis=find_match(y_out[0],1)
            if match in _rbody:
                hit+=1.0
        else:
            # Long sequences are scored with a sliding window of length _maxlen;
            # the window with the smallest match distance decides the prediction.
            all_match=[]
            for j in range(0,len(_body)-_maxlen+1):
                b_emb=[]
                for wj in range(0,_maxlen):
                    w=_body[j+wj]
                    b_emb.append(get_emb(w))
                ndata=np.array([b_emb])
                y_out=_model.predict(ndata)
                match,dis=find_match(y_out[0],1)
                all_match.append((match,dis))
            best_match=min(all_match,key=lambda x:x[1])[0]
            if best_match in _rbody:
                hit+=1.0
        i+=1
    hit/=len(_corpus)/2.0
    return hit

# Score the saved BiLSTMGPU checkpoints numbered 140 through 170 in steps of 10.
mod_no=140
logf=open(homedir+"/results/logs/BiLSTMGPU_log.txt",'a')
while mod_no<=170:
    model=load_model(homedir+"/results/models/BiLSTMGPU"+str(mod_no)+".h5")
    hit=test_on_data(corpus,maxlen,model)
    logf.write("%d,%.3f\n"%(mod_no,hit))
    mod_no+=10
logf.close()
