
Commit 8ca4413

Author: yanwii
Message: update
1 parent 1c29713 · commit 8ca4413

18 files changed: +588 −2 lines

.vscode/settings.json (+3)

@@ -0,0 +1,3 @@
{
    "python.linting.pylintEnabled": false
}

LICENSE

File mode changed: 100644 → 100755

README.md (+32 −2)

File mode changed: 100644 → 100755

@@ -4,7 +4,37 @@
 So I switched to Pytorch, and it opened the door to a new world.

 ---
-Requires
-**Python3** **Pytorch** **Jieba**
+Requirements:
+[**Python3**](https://www.python.org/)
+[**Pytorch**](https://github.com/pytorch/pytorch)
+[**Jieba**](https://github.com/fxsjy/jieba) (Chinese word segmentation)

 ---
+
+### About the BeamSearch algorithm
+A classic greedy algorithm, applied in many fields.
+
+![](./img/beamsearch.png)
+
+In this implementation we introduce a penalty factor:
+![](./img/beamsearch2.jpeg)
+
+![](./img/1.png)
+
+---
+
+### Usage
+
+    # prepare the data
+    python3 preprocessing.py
+    # train
+    python3 seq2seq.py train
+    # predict
+    python3 seq2seq.py predict
+    # retrain
+    python3 seq2seq.py retrain
+

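For reference, BeamSearch keeps the `beam_width` highest-scoring partial sequences at each decoding step and rescales every hypothesis by a length penalty, so short replies are not unfairly favored. Below is a minimal, self-contained sketch of this idea in Python; `step_fn`, `alpha`, and the GNMT-style penalty formula are illustrative assumptions, not code from this commit:

```python
import heapq

def length_penalty(length, alpha):
    # GNMT-style penalty: ((5 + length) / 6) ** alpha (one common choice)
    return ((5.0 + length) / 6.0) ** alpha

def beam_search(step_fn, start_token, eos_token, beam_width=3, max_len=20, alpha=0.7):
    """step_fn(prefix) -> list of (next_token, log_prob) candidates."""
    # each hypothesis: (penalized score, raw log-prob, token sequence)
    beams = [(0.0, 0.0, [start_token])]
    finished = []
    for _ in range(max_len):
        candidates = []
        for score, logp, seq in beams:
            if seq[-1] == eos_token:          # completed hypothesis: set it aside
                finished.append((score, logp, seq))
                continue
            for tok, tok_logp in step_fn(seq):
                new_logp = logp + tok_logp
                new_seq = seq + [tok]
                candidates.append(
                    (new_logp / length_penalty(len(new_seq), alpha), new_logp, new_seq))
        if not candidates:                    # every beam has finished
            break
        # keep only the beam_width best partial hypotheses (the "greedy" step)
        beams = heapq.nlargest(beam_width, candidates, key=lambda c: c[0])
    finished.extend(beams)
    return max(finished, key=lambda c: c[0])[2]
```

With `beam_width=1` this reduces to plain greedy decoding; larger widths trade compute for a broader search.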
data/answer.txt (+1)

@@ -0,0 +1 @@
我是你

data/dec.segement (+1)

@@ -0,0 +1 @@
我 是 你

data/dec.vec (+3)

@@ -0,0 +1,3 @@
3
3
4 5 6

data/dec.vocab (+7)

@@ -0,0 +1,7 @@
__PAD__
__GO__
__EOS__
__UNK__
我
是
你

data/enc.segement (+1)

@@ -0,0 +1 @@
你 是 谁

data/enc.vec (+3)

@@ -0,0 +1,3 @@
3 3 3 3
3
4 5 6

data/enc.vocab (+7)

@@ -0,0 +1,7 @@
__PAD__
__GO__
__EOS__
__UNK__
你
是
谁

data/question.txt (+1)

@@ -0,0 +1 @@
你是谁

data/supplementvocab.txt

Whitespace-only changes.
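Taken together, these data files show the preprocessing contract: each `*.segement` file holds one space-separated sentence per line, and each `*.vec` file holds the same sentences as indices into the corresponding `*.vocab` file (indices 0 through 3 are reserved for the special tokens, so 你/是/谁 become 4 5 6). A small sketch of the reverse mapping, assuming only the file formats shown above:

```python
def load_vocab(path):
    # one token per line; the line number is the token's index
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

def decode(vec_line, vocab):
    return [vocab[int(i)] for i in vec_line.split()]

enc_vocab = load_vocab("./data/enc.vocab")
with open("./data/enc.vec", encoding="utf-8") as f:
    for line in f:
        print(decode(line, enc_vocab))   # the last line prints ['你', '是', '谁']
```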

img/1.png (17.7 KB)

img/beamsearch.png (91.7 KB)

img/beamsearch2.jpeg (74.2 KB)

model/params.pkl (2.68 MB)

Binary file not shown.

preprocessing.py (+87)

@@ -0,0 +1,87 @@
import jieba
import re


class preprocessing():
    __PAD__ = 0
    __GO__ = 1
    __EOS__ = 2
    __UNK__ = 3
    vocab = ['__PAD__', '__GO__', '__EOS__', '__UNK__']

    def __init__(self):
        self.encoderFile = "./data/question.txt"
        self.decoderFile = "./data/answer.txt"
        self.savePath = './data/'
        # user dictionary with extra words for the segmenter
        jieba.load_userdict("./data/supplementvocab.txt")

    def wordToVocabulary(self, originFile, vocabFile, segementFile):
        vocabulary = []
        sege = open(segementFile, "w")
        with open(originFile, 'r') as en:
            for sent in en.readlines():
                # strip punctuation (currently disabled)
                if "enc" in segementFile:
                    #sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。“”’‘??、~@#¥%……&*()]+", "", sent.strip())
                    sentence = sent.strip()
                    words = jieba.lcut(sentence)
                    print(words)
                else:
                    words = jieba.lcut(sent.strip())
                vocabulary.extend(words)
                for word in words:
                    sege.write(word + " ")
                sege.write("\n")
        sege.close()

        # de-duplicate, preserving first-seen order, and prepend the special tokens
        vocab_file = open(vocabFile, "w")
        _vocabulary = list(set(vocabulary))
        _vocabulary.sort(key=vocabulary.index)
        _vocabulary = self.vocab + _vocabulary
        for index, word in enumerate(_vocabulary):
            vocab_file.write(word + "\n")
        vocab_file.close()

    def toVec(self, segementFile, vocabFile, doneFile):
        word_dicts = {}
        vec = []
        with open(vocabFile, "r") as dict_f:
            for index, word in enumerate(dict_f.readlines()):
                word_dicts[word.strip()] = index

        f = open(doneFile, "w")
        if "enc.vec" in doneFile:
            f.write("3 3 3 3\n")
            f.write("3\n")
        elif "dec.vec" in doneFile:
            f.write(str(word_dicts.get("other", 3)) + "\n")
            f.write(str(word_dicts.get("other", 3)) + "\n")
        with open(segementFile, "r") as sege_f:
            for sent in sege_f.readlines():
                sents = [i.strip() for i in sent.split(" ")[:-1]]
                vec.extend(sents)
                for word in sents:
                    # fall back to __UNK__ (index 3) for out-of-vocabulary words
                    f.write(str(word_dicts.get(word, self.__UNK__)) + " ")
                f.write("\n")
        f.close()

    def main(self):
        # build the vocabularies and the segmented files
        self.wordToVocabulary(self.encoderFile, self.savePath + 'enc.vocab', self.savePath + 'enc.segement')
        self.wordToVocabulary(self.decoderFile, self.savePath + 'dec.vocab', self.savePath + 'dec.segement')
        # convert the segmented files to index vectors
        self.toVec(self.savePath + "enc.segement",
                   self.savePath + "enc.vocab",
                   self.savePath + "enc.vec")
        self.toVec(self.savePath + "dec.segement",
                   self.savePath + "dec.vocab",
                   self.savePath + "dec.vec")


if __name__ == '__main__':
    pre = preprocessing()
    pre.main()

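A quick way to exercise the script end to end, assuming the `./data` files from this commit are already in place (the import name follows the file name; nothing outside this commit is assumed):

```python
# Hypothetical smoke test; not part of the commit.
from preprocessing import preprocessing

pre = preprocessing()
pre.main()

# With question.txt containing "你是谁", the last line of enc.vec
# should be the index sequence "4 5 6".
with open("./data/enc.vec", encoding="utf-8") as f:
    print(f.read())
```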