
Commit 2d767b7

Added DVC with remote storage (#5)
* Implemented DVC pipeline and added remote storage
* Re-attained perfect pylint score
1 parent 346c4e4 commit 2d767b7

19 files changed: +319 −45

.dvc/.gitignore

+3
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache

.dvc/config

+2
@@ -0,0 +1,2 @@
+['remote "gdrive_remote"']
+    url = gdrive://1NY8yEl6N1ZhE-q9jnEt6G6cqIHyEiCKc
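Note: a config entry like this is normally generated by `dvc remote add` rather than written by hand. Since no `[core]` default-remote entry appears in the file, the plain form (without `-d`) matches what is shown; the folder ID below is the one from this diff:

    dvc remote add gdrive_remote gdrive://1NY8yEl6N1ZhE-q9jnEt6G6cqIHyEiCKc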

.dvcignore

+3
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore

.pylintrc

+1 −1
@@ -193,7 +193,7 @@ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / stateme
 # Set the output format. Available formats are text, parseable, colorized, json
 # and msvs (visual studio). You can also give a reporter class, e.g.
 # mypackage.mymodule.MyReporterClass.
-output-format=text:data/reports/report.txt,colorized
+output-format=text:reports/pylint_report.txt,colorized

 # Tells whether to display a full report or only the messages.
 reports=y
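With this config, pylint emits both a colorized console report and the text report tracked under reports/. A minimal invocation from the repository root (the `src` target is an assumption based on the module names in the report below; pylint picks up .pylintrc automatically):

    pylint src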

data/external/.gitignore

+1
@@ -0,0 +1 @@
+*.tsv

data/models/.gitignore

+2
@@ -0,0 +1,2 @@
+c1_BoW_Sentiment_Model.pkl
+c2_Classifier_Sentiment_Model

data/models/c1_BoW_Sentiment_Model.pkl
−38.9 KB
Binary file not shown.

data/models/c2_Classifier_Sentiment_Model
−45 KB
Binary file not shown.

data/processed/.gitignore

+1
@@ -0,0 +1 @@
+*.joblib

dvc.lock

+50
@@ -0,0 +1,50 @@
+schema: '2.0'
+stages:
+  preprocessing:
+    cmd: python src/preprocessing.py
+    deps:
+    - path: data/external/a1_RestaurantReviews_HistoricDump.tsv
+      md5: 102f1f4193e0bdebdd6cce7f13e0a839
+      size: 54686
+    - path: src/preprocessing.py
+      md5: b45d76ab50b20ccabfb50d591ee7ef02
+      size: 2034
+    outs:
+    - path: data/processed/corpus.joblib
+      md5: 243212bb05cce5e3fdc72bfd2826d329
+      size: 31612
+  load_data:
+    cmd: python src/load_data.py
+    deps:
+    - path: src/load_data.py
+      md5: e579b1f5296f89c5f22d8ac4af92e1c0
+      size: 913
+    outs:
+    - path: data/external/a1_RestaurantReviews_HistoricDump.tsv
+      md5: 102f1f4193e0bdebdd6cce7f13e0a839
+      size: 54686
+  training:
+    cmd: python src/training.py
+    deps:
+    - path: data/external/a1_RestaurantReviews_HistoricDump.tsv
+      md5: 102f1f4193e0bdebdd6cce7f13e0a839
+      size: 54686
+    - path: data/processed/corpus.joblib
+      md5: 243212bb05cce5e3fdc72bfd2826d329
+      size: 31612
+    - path: src/evaluation.py
+      md5: 96c08113733680243cbc537a93cc128d
+      size: 396
+    - path: src/training.py
+      md5: 81ddde09ae93959e83afb4bae0ddd90a
+      size: 2073
+    outs:
+    - path: data/models/c1_BoW_Sentiment_Model.pkl
+      md5: 47e4584e52d616cbb5af92f988648e27
+      size: 39823
+    - path: data/models/c2_Classifier_Sentiment_Model
+      md5: e6e6744062a1d370a585d15df7f45934
+      size: 46127
+    - path: reports/model_evaluation.txt
+      md5: 35b131f5c189995225c586a8ae7025d9
+      size: 67
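dvc.lock pins the md5 and size of every dependency and output so `dvc repro` can skip stages whose inputs are unchanged. A minimal sketch of what that check amounts to (illustrative only, assuming PyYAML is available; DVC does this internally):

    import hashlib
    import yaml

    with open("dvc.lock") as f:
        lock = yaml.safe_load(f)

    # Compare one recorded output against the file currently on disk
    out = lock["stages"]["preprocessing"]["outs"][0]
    with open(out["path"], "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()

    print(out["path"], "unchanged" if digest == out["md5"] else "modified")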

dvc.yaml

+25
@@ -0,0 +1,25 @@
+stages:
+  load_data:
+    cmd: python src/load_data.py
+    deps:
+    - src/load_data.py
+    outs:
+    - data/external/a1_RestaurantReviews_HistoricDump.tsv
+  preprocessing:
+    cmd: python src/preprocessing.py
+    deps:
+    - src/preprocessing.py
+    - data/external/a1_RestaurantReviews_HistoricDump.tsv
+    outs:
+    - data/processed/corpus.joblib
+  training:
+    cmd: python src/training.py
+    deps:
+    - src/training.py
+    - src/evaluation.py
+    - data/external/a1_RestaurantReviews_HistoricDump.tsv
+    - data/processed/corpus.joblib
+    outs:
+    - data/models/c1_BoW_Sentiment_Model.pkl
+    - data/models/c2_Classifier_Sentiment_Model
+    - reports/model_evaluation.txt
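With this pipeline definition and the Google Drive remote above, the typical workflow would be (a sketch; it assumes the packages from requirements.txt are installed and Google Drive authentication has been completed):

    dvc repro                   # run stages whose dependencies changed, update dvc.lock
    dvc push -r gdrive_remote   # upload cached outputs to the remote
    dvc pull -r gdrive_remote   # fetch outputs on a fresh clone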

reports/.gitignore

+1
@@ -0,0 +1 @@
+/model_evaluation.txt

reports/pylint_report.txt

+102
@@ -0,0 +1,102 @@
+
+
+Report
+======
+109 statements analysed.
+
+Statistics by type
+------------------
+
++---------+-------+-----------+-----------+------------+---------+
+|type     |number |old number |difference |%documented |%badname |
++=========+=======+===========+===========+============+=========+
+|module   |7      |7          |=          |100.00      |0.00     |
++---------+-------+-----------+-----------+------------+---------+
+|class    |1      |1          |=          |100.00      |0.00     |
++---------+-------+-----------+-----------+------------+---------+
+|method   |3      |3          |=          |100.00      |0.00     |
++---------+-------+-----------+-----------+------------+---------+
+|function |3      |3          |=          |100.00      |0.00     |
++---------+-------+-----------+-----------+------------+---------+
+
+
+
+External dependencies
+---------------------
+::
+
+    joblib (src.main,src.preprocessing,src.training)
+    nltk (src.preprocessing)
+      \-corpus (src.preprocessing)
+      \-stem
+        \-porter (src.preprocessing)
+    pandas (src.preprocessing,src.training)
+    sklearn
+      \-feature_extraction
+      | \-text (src.training)
+      \-metrics (src.evaluation)
+      \-model_selection (src.classification,src.training)
+      \-naive_bayes (src.classification,src.training)
+
+
+
+Raw metrics
+-----------
+
++----------+-------+------+---------+-----------+
+|type      |number |%     |previous |difference |
++==========+=======+======+=========+===========+
+|code      |122    |49.19 |122      |=          |
++----------+-------+------+---------+-----------+
+|docstring |32     |12.90 |32       |=          |
++----------+-------+------+---------+-----------+
+|comment   |37     |14.92 |37       |=          |
++----------+-------+------+---------+-----------+
+|empty     |57     |22.98 |57       |=          |
++----------+-------+------+---------+-----------+
+
+
+
+Duplication
+-----------
+
++-------------------------+------+---------+-----------+
+|                         |now   |previous |difference |
++=========================+======+=========+===========+
+|nb duplicated lines      |0     |0        |0          |
++-------------------------+------+---------+-----------+
+|percent duplicated lines |0.000 |0.000    |=          |
++-------------------------+------+---------+-----------+
+
+
+
+Messages by category
+--------------------
+
++-----------+-------+---------+-----------+
+|type       |number |previous |difference |
++===========+=======+=========+===========+
+|convention |0      |1        |1          |
++-----------+-------+---------+-----------+
+|refactor   |0      |0        |0          |
++-----------+-------+---------+-----------+
+|warning    |0      |0        |0          |
++-----------+-------+---------+-----------+
+|error      |0      |0        |0          |
++-----------+-------+---------+-----------+
+
+
+
+Messages
+--------
+
++-----------+------------+
+|message id |occurrences |
++===========+============+
+
+
+
+-------------------------------------------------------------------
+Your code has been rated at 10.00/10 (previous run: 9.91/10, +0.09)

requirements.txt

+5 −1
@@ -2,4 +2,8 @@ joblib==1.1.1
 nltk==3.7
 scikit_learn==1.2.2
 setuptools==45.2.0
-
+dvc==2.58.1
+dvc_gdrive==2.19.2
+pylint==2.12.2
+mllint==0.12.2
+dslinter==2.0.9

setup.py

+1 −7
@@ -4,10 +4,6 @@

 from setuptools import setup, find_packages

-requirements = [ ]
-
-test_requirements = [ ]
-
 setup(
     author="Team 08",
     python_requires='>=3.6',
@@ -21,15 +17,13 @@
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
     ],
-    description="The model-training repository of Team 08 for the Release Engineering for Machine Learning (CS4295) course at the TU Delft.",
-    install_requires=requirements,
+    description="The model-training repository of Team 08 for the CS4295 course at the TU Delft.",
     license="MIT license",
     include_package_data=True,
     keywords='model_training',
     name='model_training',
     packages=find_packages(include=['model_training', 'model_training.*']),
     test_suite='tests',
-    tests_require=test_requirements,
     url='https://github.com/remla23-team08/model-training',
     version='0.2.0',
     zip_safe=False,

src/evaluation.py

+4 −3
@@ -1,16 +1,17 @@
 #! /usr/bin/env

 """
-Functions related to model evaluation
+Evaluate the model and return results
 """

 from sklearn.metrics import confusion_matrix, accuracy_score


 def model_eval(classifier, X_test, y_test):
     """
-    Prints model evaluation metrics
+    Returns model evaluation metrics
     """
     y_pred = classifier.predict(X_test)
     conf_matrix = confusion_matrix(y_test, y_pred)
-    print(conf_matrix, accuracy_score(y_test, y_pred))
+    acc_score = accuracy_score(y_test, y_pred)
+    return conf_matrix, acc_score
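Since model_eval now returns its metrics instead of printing them, callers (presumably src/training.py, which writes reports/model_evaluation.txt) can decide what to do with them. An illustrative sketch with hypothetical toy data, assuming it is run from the repository root:

    from sklearn.naive_bayes import GaussianNB
    from src.evaluation import model_eval

    # Hypothetical toy data, just to show the call shape
    X_train, y_train = [[0.0], [1.0]], [0, 1]
    classifier = GaussianNB().fit(X_train, y_train)

    conf_matrix, acc_score = model_eval(classifier, X_train, y_train)
    print(conf_matrix)
    print(f"Accuracy: {acc_score}")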

src/load_data.py

+23 −6
@@ -1,13 +1,30 @@
-#! /usr/bin/env
+# #! /usr/bin/env

 """
 This script loads data from the dataset_path into a pandas dataset.
 """

-import pandas as pd
+import os
+import urllib.request
+import zipfile

-def load_data(dataset_path):
-    """Function loading data from dataset_path into pandas dataset"""
-    dataset = pd.read_csv(dataset_path, delimiter = '\t', quoting = 3, dtype={'Review': object, 'Liked': int})[:]

-    return dataset
+if __name__ == "__main__":
+    # Specify the relative path to data tsv
+    root_path = os.path.dirname(os.path.abspath(__file__))
+    dataset_path = os.path.join(root_path, '..', 'data', 'external', 'a1_RestaurantReviews_HistoricDump.tsv')
+
+    # Import the data from external source
+    print("Importing external dataset..")
+    URL = r'https://drive.google.com/uc?export=download&id=1G7rLkSloPUzkK4zCzb9lLR0zSYygu8mK'
+    zip_path, _ = urllib.request.urlretrieve(URL)
+
+    # Define export path for dataset
+    export_path = os.path.dirname(os.path.abspath(dataset_path))
+
+    # Unzip at export path
+    with zipfile.ZipFile(zip_path, "r") as f:
+        f.extractall(export_path)
+
+    # Print success to console
+    print("External dataset successfully imported!")

src/preprocessing.py

+40 −27
@@ -5,43 +5,56 @@
 """

 import os
-import pickle
 import re
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem.porter import PorterStemmer
-from sklearn.feature_extraction.text import CountVectorizer
+import pandas as pd
+import joblib

-def data_preprocessing(dataset):
-    """
-    Main preprocessing steps for ML data
-    """
-    nltk.download('stopwords')
-    porter_stem = PorterStemmer()

-    all_stopwords = stopwords.words('english')
-    all_stopwords.remove('not')
+class Preprocessing:
+    """Class to easily preprocess datasets"""

-    corpus=[]
-    for i in range(0, len(dataset)):
-        review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
+    def __init__(self):
+        """Initialize preprocess class"""
+        nltk.download('stopwords')
+        self.porter_stem = PorterStemmer()
+        self.all_stopwords = stopwords.words('english')
+        self.all_stopwords.remove('not')
+
+        self.dataset = None
+        self.count_vectorizer = None
+
+    def preprocess_dataset(self, dataset):
+        """Loop over entire dataset to preprocess"""
+        corpus = []
+        for i in range(0, len(dataset)):
+            corpus.append(self.preprocess_review(dataset['Review'][i]))
+        return corpus
+
+    def preprocess_review(self, review):
+        """Processing a single review"""
+        review = re.sub('[^a-zA-Z]', ' ', review)
         review = review.lower()
         review = review.split()
-        review = [porter_stem.stem(word) for word in review if not word in set(all_stopwords)]
+        review = [self.porter_stem.stem(word) for word in review if not word in set(self.all_stopwords)]
         review = ' '.join(review)
-        corpus.append(review)
+        return review

-    # Use count vectoriser to transform dataset
-    count_vectoriser = CountVectorizer(max_features = 1420)
-    X = count_vectoriser.fit_transform(corpus).toarray()
-    y = dataset.iloc[:, -1].values

-    # Get the root path of the current script, and bow path to save dictionary later
+if __name__ == "__main__":
+    # Specify the relative path to data tsv
     root_path = os.path.dirname(os.path.abspath(__file__))
-    bow_path = os.path.join(root_path, '..', 'data', 'models', 'c1_BoW_Sentiment_Model.pkl')
-
-    # Saving BoW dictionary to later use in prediction
-    with open(bow_path, "wb") as file:
-        pickle.dump(count_vectoriser, file)
-
-    return X, y
+    dataset_path = os.path.join(root_path, '..', 'data', 'external', 'a1_RestaurantReviews_HistoricDump.tsv')
+
+    # Load data from file
+    load_dataset = pd.read_csv(dataset_path, delimiter = '\t', quoting = 3, dtype={'Review': object, 'Liked': int})[:]
+
+    # Preprocess and store processed corpus in joblib
+    print("Preprocessing the dataset...")
+    preprocess_class = Preprocessing()
+    save_corpus = preprocess_class.preprocess_dataset(load_dataset)
+    corpus_path = os.path.join(root_path, '..', 'data/processed/corpus.joblib')
+    joblib.dump(save_corpus, corpus_path)
+    print(f"Processed dataset (corpus) is saved to: {corpus_path}")
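A quick sketch of the new class in isolation (hypothetical review text; assumes the NLTK stopwords download succeeds and the snippet runs from the repository root):

    from src.preprocessing import Preprocessing

    prep = Preprocessing()
    # Single review -> lowercased, stemmed string with stopwords (except "not") removed
    print(prep.preprocess_review("The food was not good!"))  # -> "food not good"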
