NN(own data).py
import pickle

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Requires the NLTK tokenizer and lemmatizer data; if missing, run:
# nltk.download('punkt'); nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
'''
Input CSV columns (the Sentiment140 dataset):
    polarity (0 = negative, 2 = neutral, 4 = positive)
    id
    date
    query
    user
    tweet
'''
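# Illustrative raw row (field values here are made up, but the layout matches
# the column list above):
# "0","12345","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","some_user","the tweet text"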
def init_process(fin, fout):
    '''Rewrite raw rows as "one-hot-label:::tweet" lines.'''
    with open(fout, 'a') as outfile:
        with open(fin, buffering=200000, encoding='latin-1') as f:
            try:
                for line in f:
                    line = line.replace('"', '')
                    initial_polarity = line.split(',')[0]
                    if initial_polarity == '0':
                        initial_polarity = [1, 0]  # negative
                    elif initial_polarity == '4':
                        initial_polarity = [0, 1]  # positive
                    else:
                        continue  # skip neutral (2) rows so every label is one-hot
                    # maxsplit=5 keeps commas inside the tweet text itself
                    tweet = line.split(',', 5)[-1]
                    outline = str(initial_polarity) + ':::' + tweet
                    outfile.write(outline)
            except Exception as e:
                print(str(e))

init_process('training.1600000.processed.noemoticon.csv', 'train_set.csv')
init_process('testdata.manual.2009.06.14.csv', 'test_set.csv')
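
# A quick sanity check (a sketch, assuming the two Sentiment140 CSVs above are
# in the working directory): each emitted line should look like
# "[1, 0]:::some tweet text".
with open('train_set.csv', encoding='latin-1') as f:
    print(f.readline().strip())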
def create_lexicon(fin):
    '''Build a lexicon from every 2500th training tweet.'''
    lexicon = set()
    with open(fin, 'r', buffering=100000, encoding='latin-1') as f:
        try:
            counter = 1
            for line in f:
                counter += 1
                if counter % 2500 == 0:  # sample every 2500th line
                    tweet = line.split(':::')[1]
                    # lowercase here so lookups in convert_to_vec (which
                    # lowercases its tokens) can actually match
                    words = word_tokenize(tweet.lower())
                    lexicon |= {lemmatizer.lemmatize(i) for i in words}
                    print(counter, len(lexicon))
        except Exception as e:
            print(str(e))
    with open('lexicon-2500-2638.pickle', 'wb') as f:
        pickle.dump(list(lexicon), f)  # stored as a list so it can be indexed

create_lexicon('train_set.csv')
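
# Optional inspection (illustrative): confirm the pickle round-trips.
with open('lexicon-2500-2638.pickle', 'rb') as f:
    lexicon_check = pickle.load(f)
print(len(lexicon_check), 'words in lexicon')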
def convert_to_vec(fin, fout, lexicon_pickle):
    '''Turn each "label:::tweet" line into a bag-of-words feature vector.'''
    with open(lexicon_pickle, 'rb') as f:
        lexicon = pickle.load(f)
    word_index = {word: i for i, word in enumerate(lexicon)}  # O(1) lookups
    with open(fout, 'a') as outfile:
        with open(fin, buffering=20000, encoding='latin-1') as f:
            counter = 0
            for line in f:
                counter += 1
                label = line.split(':::')[0]
                tweet = line.split(':::')[1]
                current_words = word_tokenize(tweet.lower())
                current_words = [lemmatizer.lemmatize(i) for i in current_words]
                features = np.zeros(len(lexicon))
                for word in current_words:
                    if word in word_index:
                        # count occurrences; use `= 1` instead for binary presence
                        features[word_index[word]] += 1
                # note the two-colon separator on output, vs ':::' on input
                outline = str(list(features)) + '::' + str(label) + '\n'
                outfile.write(outline)
            print(counter)

convert_to_vec('test_set.csv', 'processed-test-set.csv', 'lexicon-2500-2638.pickle')
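
# A minimal sketch (assuming the "features::label" format written above) of
# reading the vectors back for training/evaluation; ast.literal_eval parses
# the stringified Python lists. load_processed is a hypothetical helper name.
import ast

def load_processed(path='processed-test-set.csv'):
    features, labels = [], []
    with open(path, encoding='latin-1') as f:
        for line in f:
            feats, label = line.rsplit('::', 1)
            features.append(ast.literal_eval(feats))
            labels.append(ast.literal_eval(label))
    return np.array(features), np.array(labels)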