-
Notifications
You must be signed in to change notification settings - Fork 1
/
1_Numeric_Data_Cleaning.py
67 lines (50 loc) · 2.61 KB
/
1_Numeric_Data_Cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
## Loading the dataset
# UCI Abalone dataset: physical measurements -> ring count ("age").
columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)
## Defining target and predictor variables
y = df.age #target
X = df.drop(columns=['age'])
## Numerical columns:
num_cols = X.select_dtypes(include=np.number).columns
## Categorical columns
cat_cols = X.select_dtypes(include=['object']).columns
## Create some missing values randomly in dataset:
## test how well different data handling strategies e.g. imputation methods perform in presence of missing values.
# NOTE(review): the RNG is unseeded, so the injected NaN pattern differs on
# every run — consider np.random.seed(...) if reproducibility is needed.
# 1000 draws with replacement, so fewer than 1000 distinct cells may be hit.
for _ in range(1000):
    X.loc[np.random.choice(X.index), np.random.choice(X.columns)] = np.nan
## Hold out 25% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)
# Manual preprocessing: mean-impute missing values, then standardize.
## Numeric training features only
x_train_num = x_train[num_cols]
# Column means computed on the TRAINING data; these same statistics are
# reused below for the test set to avoid leaking test information.
train_means = x_train_num.mean()
x_train_fill_missing = x_train_num.fillna(train_means)
## Fit the scaler on the imputed training data, then transform it
scale = StandardScaler()
scale.fit(x_train_fill_missing)
x_train_fill_missing_scale = scale.transform(x_train_fill_missing)
## Test set: impute with the training means, scale with the fitted scaler
x_test_fill_missing = x_test[num_cols].fillna(train_means)
x_test_fill_missing_scale = scale.transform(x_test_fill_missing)
#####-------Imputation and Scaling: Code base to transform -----------------#####
#1. Same preprocessing expressed as a Pipeline: mean imputation -> standardization
pipeline=Pipeline([
    ("imputer", SimpleImputer(strategy='mean')),
    ("scale", StandardScaler()),
])
#2. Fit the pipeline on the TRAINING numerics, then transform the test numerics
pipeline.fit(x_train[num_cols])
x_transform=pipeline.transform(x_test[num_cols])
#3. The pipeline output should exactly match the manual fillna+scale result
array_diff=np.array_equal(x_transform,x_test_fill_missing_scale)
print(f'pipeline_arr == np_arr: {array_diff}')
#4. A second pipeline using median imputation instead of mean
pipeline_median=Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale',StandardScaler()),
])
#5. Total absolute difference between mean- and median-imputed transforms
pipeline_median.fit(x_train[num_cols])
x_transform_median=pipeline_median.transform(x_test[num_cols])
new_array_diff=abs(x_transform-x_transform_median).sum()
print(f'pipeline_arr_med - np_arr_med = {new_array_diff}')