-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentence_sim.py
141 lines (110 loc) · 4.44 KB
/
sentence_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import warnings
warnings.filterwarnings('ignore') # 警告扰人,手动封存
import gc
import tqdm
import numpy as np
from gensim import corpora, models, similarities
from collections import defaultdict
from sentence import Sentence
class SentenceSimilarity():
    """Find the most similar stored sentence(s) to a query sentence.

    Builds a gensim bag-of-words corpus over a list of sentences, then one of
    several vector-space models (TF-IDF / LSI / LDA) plus a similarity index.

    Typical usage:
        set_sentences(...) -> TfidfModel()/LsiModel()/LdaModel()
        -> similarity(query) or similarity_k(query, k)
    """

    def __init__(self, seg):
        # seg: word segmenter (e.g. a jieba wrapper) forwarded to Sentence.
        self.seg = seg

    def set_sentences(self, sentences):
        """Wrap each raw sentence string in a Sentence, remembering its index."""
        self.sentences = [Sentence(text, self.seg, i)
                          for i, text in enumerate(sentences)]

    def get_cuted_sentences(self):
        """Return the tokenized (word-segmented) form of every stored sentence."""
        return [sentence.get_cuted_sentence() for sentence in self.sentences]

    def simple_model(self, min_frequency=1):
        """Build the dictionary and bag-of-words corpus shared by all models.

        Tokens whose corpus-wide frequency is <= min_frequency are dropped
        before the dictionary is built.
        """
        self.texts = self.get_cuted_sentences()
        # Count corpus-wide token frequencies.
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        # Filter out low-frequency tokens (frequency <= min_frequency).
        self.texts = [[token for token in text if frequency[token] > min_frequency]
                      for text in self.texts]
        # Map tokens to integer ids, then each text to a bag-of-words vector.
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]

    def TfidfModel(self):
        """Build a TF-IDF model over the corpus and its similarity index."""
        self.simple_model()
        self.model = models.TfidfModel(self.corpus_simple, dictionary=self.dictionary)
        self.corpus = self.model[self.corpus_simple]
        self.index = similarities.MatrixSimilarity(self.corpus)

    def LsiModel(self):
        """Build an LSI model (gensim default topic count) and its index."""
        self.simple_model()
        self.model = models.LsiModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        self.index = similarities.MatrixSimilarity(self.corpus)

    def LdaModel(self):
        """Build an LDA model (gensim default topic count) and its index."""
        self.simple_model()
        self.model = models.LdaModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        self.index = similarities.MatrixSimilarity(self.corpus)

    def sentence2vec(self, sentence):
        """Segment a new query sentence and project it into the model space."""
        sentence = Sentence(sentence, self.seg)
        vec_bow = self.dictionary.doc2bow(sentence.get_cuted_sentence())
        return self.model[vec_bow]

    def bow2vec(self):
        """Return the modeled corpus as dense numpy vectors, one per sentence."""
        vec = []
        length = max(self.dictionary) + 1  # highest token id + 1 = vector width
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for token_id, weight in content:
                # Place each token's model weight (e.g. tf-idf) at its id slot.
                sentence_vectors[token_id] = weight
            vec.append(sentence_vectors)
        return vec

    def similarity(self, sentence):
        """Return the stored Sentence most similar to the query, score attached."""
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]
        index, score = max(enumerate(sims), key=lambda item: item[1])
        best = self.sentences[index]
        best.set_score(score)
        return best  # a Sentence instance

    def similarity_k(self, sentence, k):
        """Return (indexes, scores) of the top-k most similar stored sentences."""
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]
        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]
        indexs = [i[0] for i in sim_k]
        scores = [i[1] for i in sim_k]
        return indexs, scores
if __name__ == '__main__':
    # Smoke-test: index WebQA questions with TF-IDF, then query the top-5.
    from jieba_utils import Seg
    from data_utils import get_qa_data

    segmenter = Seg()
    sim = SentenceSimilarity(segmenter)

    path = './WebQA.v1.0/me_train.json'
    qa_pairs = get_qa_data(path)
    questions = [pair[0] for pair in qa_pairs]  # keep only the question text

    sim.set_sentences(questions)
    sim.simple_model()
    sim.TfidfModel()

    query = "世界最高的山峰"
    indexs, scores = sim.similarity_k(query, 5)
    for ind, score in zip(indexs, scores):
        print(sim.sentences[ind].origin_sentence, score)