main.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Rank each document's terms by TF-IDF: segment Chinese articles with jieba,
# then weight the resulting term counts with scikit-learn.
import os

import jieba
import jieba.posseg as pseg
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def titlelist():
    # Extension-less entries in the working directory are category folders;
    # each file inside yields a "category--filename" title.
    for file in os.listdir('.'):
        if '.' not in file:
            for f in os.listdir(file):
                # For Windows encoding issues, append (Python 2 only):
                # .decode('gbk', 'ignore').encode('utf-8')
                yield file + '--' + f.split('.')[0]
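

# Assumed on-disk layout (inferred from the directory walk above; the names
# below are illustrative, not part of the original project):
#
#   chinese_stopword.txt      one stop word per line, UTF-8
#   category_a/               category folder (no '.' in its name)
#       article1.txt          one article per file, UTF-8 plain text
#       article2.txt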
def wordslist():
    jieba.add_word('丹妮莉丝')  # register the proper noun (Daenerys) so jieba keeps it whole
    with open('chinese_stopword.txt', encoding='utf-8') as sw:
        stop_word = [line.rstrip() for line in sw]
    print(len(stop_word))
    for file in os.listdir('.'):
        if '.' not in file:
            for f in os.listdir(file):
                with open(os.path.join(file, f), encoding='utf-8') as t:
                    content = t.read().strip().replace('\n', '').replace(' ', '').replace('\t', '').replace('\r', '')
                seg_list = pseg.cut(content)
                # Remove stop words, keeping the remaining tokens.
                seg_list_after = [seg.word for seg in seg_list if seg.word not in stop_word]
                # Join with spaces so CountVectorizer can tokenize the result.
                yield ' '.join(seg_list_after)
if __name__ == '__main__':
    corpus = list(wordslist())   # renamed so the generator functions are not shadowed
    titles = list(titlelist())
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    # Raw term counts -> TF-IDF weights, one row per document.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    words = vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
    weight = tfidf.toarray()
    n = 5  # show the top five keywords per document
    for title, w in zip(titles, weight):
        print('{}:'.format(title))
        # Indices sorted by descending weight.
        loc = np.argsort(-w)
        for i in range(n):
            print('-{}: {} {}'.format(i + 1, words[loc[i]], w[loc[i]]))
        print()
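

# The CountVectorizer + TfidfTransformer pair above can be collapsed into
# scikit-learn's single-step TfidfVectorizer. A minimal standalone sketch of
# the equivalence; the two sample documents are made up for illustration:
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   import numpy as np
#
#   docs = ['king queen dragon', 'king castle winter']  # hypothetical documents
#   vec = TfidfVectorizer()              # counts + TF-IDF weighting in one step
#   tfidf = vec.fit_transform(docs)
#   words = vec.get_feature_names_out()  # get_feature_names() on sklearn < 1.0
#   for row in tfidf.toarray():
#       top = np.argsort(-row)[:2]       # indices of the two largest weights
#       print([(words[i], round(row[i], 3)) for i in top])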