-
Notifications
You must be signed in to change notification settings - Fork 29
/
match.py
95 lines (73 loc) · 2.76 KB
/
match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import os
import random
from responsesEvaluate import Evaluator
from Matcher.fuzzyMatcher import FuzzyMatcher
from Matcher.wordWeightMatcher import WordWeightMatcher
from Matcher.bm25Matcher import bestMatchingMatcher
from Matcher.matcher import Matcher
def main():
    """Script entry point: run the interactive matcher test with the "bm25" matcher.

    Stop-word removal is switched off for this run.
    """
    matcherTesting(matcherType="bm25", removeStopWords=False)
def getMatcher(matcherType,removeStopWords=False):
    """
    Return a fully initialized Matcher.

    Args:
        - matcherType: which string-matching strategy to use
            - "WordWeight"
            - "Fuzzy"
            - "bm25"
            - "Vectorize" / "DeepLearning": reserved names, not implemented yet
        - removeStopWords:
            - a boolean forwarded to the fuzzy matcher factory only; the
              other matcher types do not receive it.

    Returns:
        The matcher object built by the factory function for `matcherType`.

    Raises:
        NotImplementedError: `matcherType` is one of the reserved names.
        SystemExit: `matcherType` is not recognized. The message is written
            to stderr by the interpreter and the exit status is non-zero.
    """
    if matcherType == "WordWeight":
        return woreWeightMatch()
    if matcherType == "Fuzzy":
        return fuzzyMatch(removeStopWords)
    if matcherType == "bm25":
        return bm25()
    if matcherType in ("Vectorize", "DeepLearning"):
        # TODO: implement these matchers. Fail loudly here instead of
        # returning None, which would only surface later as an attribute
        # error on the first `matcher.match(...)` call.
        raise NotImplementedError(
            "Matcher type %r is not implemented yet." % (matcherType,)
        )

    # Raise SystemExit directly rather than calling the site-provided
    # `exit()` helper: it does not depend on the `site` module being loaded
    # and a string argument yields a non-zero exit status.
    raise SystemExit("[Error]: Invalid matcher type: %r" % (matcherType,))
def matcherTesting(matcherType,removeStopWords=False):
    """
    Interactive test loop for a matcher.

    Builds the matcher once via `getMatcher`, then repeatedly:
      1. reads a query from stdin,
      2. asks the matcher for the best matching title and its index,
      3. prints the title with the matcher's similarity value,
      4. loads the reply file selected by the index and prints the five
         candidates returned by `Evaluator.getBestResponse`.

    The loop has no exit condition of its own; it ends only when an
    exception propagates (e.g. EOF or Ctrl-C on the input prompt).

    Args:
        - matcherType: matcher name passed to `getMatcher`.
        - removeStopWords: passed through to `getMatcher`.
    """
    matcher = getMatcher(matcherType,removeStopWords)
    while True:
        query = input("隨便說些什麼吧: ")
        title,index = matcher.match(query)
        sim = matcher.getSimilarity()
        # NOTE: %d drops the fractional part if the similarity is not an integer.
        print("最為相似的標題是 %s ,相似度為 %d " % (title,sim))

        # The matched index selects both the reply file (index / 1000,
        # truncated) and the entry inside that file (index % 1000).
        replyPath = os.path.join("data/processed/reply/",str(int(index/1000))+'.json')
        # Use a context manager so the file handle is closed on every
        # iteration instead of leaking until garbage collection.
        with open(replyPath,'r',encoding='utf-8') as replyFile:
            res = json.load(replyFile)
        targetId = index % 1000

        evaluator = Evaluator()
        candidates = evaluator.getBestResponse(responses=res[targetId],topk=5,debugMode=False)
        print("以下是相似度前 5 高的回應")
        for candidate in candidates:
            # Each candidate is indexed as (response text, score).
            print("%s %f" % (candidate[0],candidate[1]))
def woreWeightMatch():
    """Build and return a ready-to-use WordWeightMatcher.

    The matcher is created with the "jieba" segmentation library, loads its
    titles from data/Titles.txt and is initialized before being returned.
    """
    matcher = WordWeightMatcher(segLib="jieba")
    matcher.loadTitles(path="data/Titles.txt")
    matcher.initialize()
    return matcher
def fuzzyMatch(cleansw=False):
    """Build and return a FuzzyMatcher loaded with data/Titles.txt.

    Args:
        cleansw: passed to the matcher as `removeStopWords`. When truthy, the
            titles are additionally segmented and re-joined before the
            matcher is returned.

    Returns:
        The configured FuzzyMatcher instance.
    """
    matcher = FuzzyMatcher(segLib="jieba", removeStopWords=cleansw)
    matcher.loadTitles(path="data/Titles.txt")

    # Without stop-word removal the loaded titles are used as they are.
    if not cleansw:
        return matcher

    # NOTE(review): both calls below are assumed to belong to the stop-word
    # branch — confirm against the original indentation.
    matcher.TitlesSegmentation(cleansw)
    matcher.joinTitles()
    return matcher
#load a custom user dictionary.
#fuzzyMatcher.TaibaCustomSetting(usr_dict="jieba_dictionary/ptt_dic.txt")
#load stopwords
#fuzzyMatcher.loadStopWords(path="data/stopwords/chinese_sw.txt")
#fuzzyMatcher.loadStopWords(path="data/stopwords/ptt_words.txt")
#fuzzyMatcher.loadStopWords(path="data/stopwords/specialMarks.txt")
def bm25():
    """Build and return a ready-to-use bestMatchingMatcher.

    The matcher loads its titles from data/Titles.txt and is initialized
    before being returned.
    """
    matcher = bestMatchingMatcher()
    matcher.loadTitles(path="data/Titles.txt")
    matcher.initialize()
    return matcher
# Run the interactive test only when executed as a script, not on import.
if __name__ == "__main__":
    main()