'''
Source: Natural Language Processing with Classification and Vector Spaces, Week 2, Coursera Assignment
Description: Naive Bayes Classifier
'''
import re
import numpy as np
def lookup(freqs, word, label):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears
    '''
    n = 0  # initialize the count to 0
    # retrieve count from the freqs dictionary for the (word, label) pair
    if (word, label) in freqs:
        n = freqs[(word, label)]
    return n
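
# A quick sanity check for lookup (illustrative values, not from the assignment):
#   freqs = {('happy', 1): 12, ('happy', 0): 3}
#   lookup(freqs, 'happy', 1)  # -> 12
#   lookup(freqs, 'sad', 1)    # -> 0, missing (word, label) pairs default to zero
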
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: the log prior (equation 3 in the assignment)
        loglikelihood: the log likelihood of your Naive Bayes equation (equation 6 in the assignment)
    '''
    loglikelihood = {}
    logprior = 0

    ### START CODE HERE ###
    # calculate V, the number of unique words in the vocabulary
    vocab = set(pair[0] for pair in freqs.keys())
    V = len(vocab)

    # calculate N_pos and N_neg, the total counts of positive and negative words
    N_pos = N_neg = 0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:
            # increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs[pair]
        # else, the label is negative
        else:
            # increment the number of negative words by the count for this (word, label) pair
            N_neg += freqs[pair]

    # calculate D, the number of documents
    D = len(train_y)
    # calculate D_pos, the number of positive documents
    D_pos = len(list(filter(lambda x: x > 0, train_y)))
    # calculate D_neg, the number of negative documents
    D_neg = len(list(filter(lambda x: x <= 0, train_y)))

    # calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)

    # for each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # calculate the Laplace-smoothed probability that the word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    ### END CODE HERE ###

    return logprior, loglikelihood
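
# train_naive_bayes expects freqs to be built elsewhere. Below is a minimal
# sketch of one way to construct it, assuming tweets is a list of raw strings
# and labels the matching 0/1 values; the helper name build_freqs is
# hypothetical and not part of the original assignment file.
def build_freqs(tweets, labels):
    freqs = {}
    for tweet, label in zip(tweets, labels):
        for word in process_tweet(tweet):
            pair = (word, label)
            # count every occurrence of this (word, label) pair
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs
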
def process_tweet(tweet):
    '''
    Input:
        tweet: a string
    Output:
        a list of processed tokens from the tweet
    '''
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags (only removing the hash # sign from the word)
    tweet = re.sub(r'#', '', tweet)
    # lowercase and split into tokens so that naive_bayes_predict
    # iterates over words rather than characters
    return tweet.lower().split()
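
# Example (illustrative): under the simplified tokenization above,
#   process_tweet('RT @user check https://x.co #fun')
# returns ['@user', 'check', 'fun'].
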
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of the log likelihoods of each word in the tweet (if found in the dictionary) + logprior (a number)
    '''
    # process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # initialize the score with the log prior
    p = logprior

    for word in word_l:
        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the score
            p += loglikelihood[word]

    return p
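
# A minimal end-to-end sketch using the hypothetical build_freqs helper above;
# the toy tweets and labels are illustrative, not from the Coursera dataset.
if __name__ == '__main__':
    train_x = ['I am happy :)', 'great day #sunshine', 'I am sad', 'this is awful']
    train_y = [1, 1, 0, 0]
    freqs = build_freqs(train_x, train_y)
    logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
    # a positive score predicts the positive class, a negative one the negative class
    print(naive_bayes_predict('happy great day', logprior, loglikelihood))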