@@ -5,43 +5,56 @@
 """
 
 import os
-import pickle
 import re
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem.porter import PorterStemmer
-from sklearn.feature_extraction.text import CountVectorizer
+import pandas as pd
+import joblib
 
-def data_preprocessing(dataset):
-    """
-    Main preprocessing steps for ML data
-    """
-    nltk.download('stopwords')
-    porter_stem = PorterStemmer()
 
-    all_stopwords = stopwords.words('english')
-    all_stopwords.remove('not')
+class Preprocessing:
+    """Class to easily preprocess datasets"""
 
-    corpus=[]
-    for i in range(0, len(dataset)):
-        review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
+    def __init__(self):
+        """Initialize preprocess class"""
+        nltk.download('stopwords')
+        self.porter_stem = PorterStemmer()
+        self.all_stopwords = stopwords.words('english')
+        self.all_stopwords.remove('not')
+
+        self.dataset = None
+        self.count_vectorizer = None
+
+    def preprocess_dataset(self, dataset):
+        """Loop over entire dataset to preprocess"""
+        corpus = []
+        for i in range(0, len(dataset)):
+            corpus.append(self.preprocess_review(dataset['Review'][i]))
+        return corpus
+
+    def preprocess_review(self, review):
+        """Processing a single review"""
+        review = re.sub('[^a-zA-Z]', ' ', review)
         review = review.lower()
         review = review.split()
-        review = [porter_stem.stem(word) for word in review if not word in set(all_stopwords)]
+        review = [self.porter_stem.stem(word) for word in review if not word in set(self.all_stopwords)]
         review = ' '.join(review)
-        corpus.append(review)
+        return review
 
-    # Use count vectoriser to transform dataset
-    count_vectoriser = CountVectorizer(max_features = 1420)
-    X = count_vectoriser.fit_transform(corpus).toarray()
-    y = dataset.iloc[:, -1].values
 
-    # Get the root path of the current script, and bow path to save dictionary later
+if __name__ == "__main__":
+    # Specify the relative path to data tsv
     root_path = os.path.dirname(os.path.abspath(__file__))
-    bow_path = os.path.join(root_path, '..', 'data', 'models', 'c1_BoW_Sentiment_Model.pkl')
-
-    # Saving BoW dictionary to later use in prediction
-    with open(bow_path, "wb") as file:
-        pickle.dump(count_vectoriser, file)
-
-    return X, y
+    dataset_path = os.path.join(root_path, '..', 'data', 'external', 'a1_RestaurantReviews_HistoricDump.tsv')
+
+    # Load data from file
+    load_dataset = pd.read_csv(dataset_path, delimiter = '\t', quoting = 3, dtype={'Review': object, 'Liked': int})[:]
+
+    # Preprocess and store processed corpus in joblib
+    print("Preprocessing the dataset...")
+    preprocess_class = Preprocessing()
+    save_corpus = preprocess_class.preprocess_dataset(load_dataset)
+    corpus_path = os.path.join(root_path, '..', 'data/processed/corpus.joblib')
+    joblib.dump(save_corpus, corpus_path)
+    print(f"Processed dataset (corpus) is saved to: {corpus_path}")
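This change moves the CountVectorizer and pickle handling out of preprocessing and leaves only a stemmed corpus on disk, so vectorisation and model fitting presumably happen in a separate training step that reads data/processed/corpus.joblib. A minimal sketch of such a step is given below, assuming the paths, the 'Liked' label column, and the max_features=1420 value carried over from the removed data_preprocessing(); the script itself, the GaussianNB classifier, and the bow_vectorizer.joblib output name are illustrative assumptions, not part of this diff.

# Hypothetical follow-up training step (illustrative, not part of this change):
# loads the corpus saved by Preprocessing, vectorises it, and fits a classifier.
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

root_path = os.path.dirname(os.path.abspath(__file__))

# Preprocessed corpus produced by Preprocessing.preprocess_dataset() above.
corpus = joblib.load(os.path.join(root_path, '..', 'data', 'processed', 'corpus.joblib'))

# Labels still come from the original TSV ('Liked' is the last column).
dataset = pd.read_csv(
    os.path.join(root_path, '..', 'data', 'external', 'a1_RestaurantReviews_HistoricDump.tsv'),
    delimiter='\t', quoting=3)
y = dataset.iloc[:, -1].values

# Bag-of-words transform; max_features=1420 matches the removed data_preprocessing().
count_vectorizer = CountVectorizer(max_features=1420)
X = count_vectorizer.fit_transform(corpus).toarray()

# Persist the fitted vectorizer so prediction can reuse the same vocabulary
# (replacing the pickle dump this diff removes), then fit a simple classifier.
joblib.dump(count_vectorizer, os.path.join(root_path, '..', 'data', 'models', 'bow_vectorizer.joblib'))
classifier = GaussianNB().fit(X, y)

Splitting the pipeline this way also lets prediction code reuse Preprocessing.preprocess_review() on a single incoming review before applying the stored vectorizer.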