-
Notifications
You must be signed in to change notification settings - Fork 1
/
bbc.py
executable file
·131 lines (115 loc) · 3.48 KB
/
bbc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python3
import codecs
import glob
import os
import signal
import string
import sys
import npr
keep_running = True
def handler(signum, frame):
print('Signal handler called with signal', signum)
global keep_running
if signum == signal.SIGINT:
keep_running = False
class Text(object):
exclude = set(string.punctuation)
exclude.add(u'\u2013') # endash
exclude.add(u'\u2014') # emdash
excludeStr = ''.join(exclude)
@staticmethod
def stripPunctuation(text):
return text.strip(Text.excludeStr)
@staticmethod
def extractWords(text):
return [Text.stripPunctuation(w.strip()) for w in text.split()]
@staticmethod
def rawText(paragraphs):
words = []
for paragraph in paragraphs:
newSentence = True
for word in paragraph.split():
word.strip()
if newSentence:
if len(word) > 1:
word = word.lower()
newSentence = False
newSentence = word.endswith('.')
words.append(Text.stripPunctuation(word))
return ' '.join(words)
class Story(object):
def __init__(self, category):
self.category = category
self.paragraphs = []
self.title = ''
def hasText(self):
return len(self.paragraphs) > 0
def rawText(self):
return Text.rawText(self.paragraphs)
class BBC(object):
def __init__(self):
pass
@staticmethod
def readStory(file_name):
category = os.path.basename(os.path.dirname(file_name))
story = Story(category)
sentence_num = 0
with codecs.open(file_name, 'r', 'iso-8859-1') as f:
for line in f.readlines():
line = line.strip()
if not line:
continue
sentence_num += 1
if sentence_num == 1:
story.title = line
else:
story.paragraphs.append(line)
return story
@staticmethod
def readStories(dir):
# Read the BBC story corpus downloaded
# from http://mlg.ucd.ie/datasets/bbc.html
stories = []
file_names = glob.glob('%s/*/*.txt' % dir)
for file_name in file_names:
stories.append(BBC.readStory(file_name))
return stories
@staticmethod
def classifyStories(stories):
print("Predicting stories...")
story_texts = []
stories_being_classified = []
for story in stories:
if story.category == 'sport':
continue
if story.hasText():
stories_being_classified.append(story)
story_texts.append(story.rawText())
if not story_texts:
print('No story text to classify')
return
print('Predicting for', len(story_texts), 'non-sport stories')
(clp, lb) = npr.NPR.createClassifierPipeline(18, apply_auto_tags=True)
predicted = clp.predict(story_texts)
new_labels = {}
all_labels = lb.inverse_transform(predicted)
num_labeled_stories = 0
for story, categories in zip(stories_being_classified, all_labels):
if len(categories):
num_labeled_stories += 1
for category in categories:
if category in new_labels:
new_labels[category] += 1
else:
new_labels[category] = 1
print('%s => %s' % (story.title, ', '.join(categories) ))
print()
print('Labeled', num_labeled_stories, 'stories')
print('Newly added tags: %d' % len(new_labels))
print('====================')
for category in sorted(new_labels.keys()):
print('%s: %d' % (category, new_labels[category]))
if __name__ == '__main__':
stories = BBC.readStories('bbc')
print('Loaded', len(stories), 'stories')
BBC.classifyStories(stories)