forked from shubham0204/Spam_Classification_TF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataProcessor.py
48 lines (37 loc) · 1.43 KB
/
DataProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
# Prepare the spam dataset for training: encode the labels, tokenize and pad
# the messages, split them into train/test sets, and save the resulting arrays
# together with the tokenizer vocabulary.

# Only the label column ('v1') and the message column ('v2') are loaded.
dataframe = pd.read_csv('raw_data/data.csv', encoding='iso8859', usecols=['v1', 'v2'])

# 'ham' is encoded as 0; every other label value is encoded as 1.
labels = [0 if label == 'ham' else 1 for label in dataframe.v1]
texts = list(dataframe.v2)

# The padded length is the largest whitespace-separated word count.
# NOTE(review): the Keras Tokenizer also splits on punctuation by default, so
# a tokenized message may contain more tokens than this count, in which case
# pad_sequences truncates it to maxlen -- confirm whether that is intended
# before changing it, since consumers of the saved arrays may rely on this
# width.
lengths = [len(text.split()) for text in texts]
maxlen = max(lengths)

labels = np.array(labels)
texts = np.array(texts)

# Build the vocabulary from all messages and convert each message to a
# sequence of integer word indices.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
tokenized_messages = tokenizer.texts_to_sequences(texts)

# Bring every sequence to the same length and one-hot encode the two classes.
padded_messages = tf.keras.preprocessing.sequence.pad_sequences(tokenized_messages, maxlen)
onehot_labels = tf.keras.utils.to_categorical(labels, num_classes=2)

X = padded_messages
Y = onehot_labels

print(X.shape)
print(Y.shape)
print('MESSAGE MAXLEN = {}'.format(maxlen))

# Hold out 40% of the samples for testing. No random_state is passed, so the
# split is not fixed across runs.
train_features, test_features, train_labels, test_labels = train_test_split(X, Y, test_size=0.4)

output_path = 'processed_data/'

# Create the output directories up front so the writes below do not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(output_path, exist_ok=True)
os.makedirs('android', exist_ok=True)

np.save('{}x.npy'.format(output_path), train_features)
np.save('{}y.npy'.format(output_path), train_labels)
np.save('{}test_x.npy'.format(output_path), test_features)
np.save('{}test_y.npy'.format(output_path), test_labels)

# Export the word -> index mapping as JSON.
with open('android/word_dict.json', 'w') as file:
    json.dump(tokenizer.word_index, file)

print( 'Data processed.')