-
Notifications
You must be signed in to change notification settings - Fork 1
/
2_Categorical_Data_Cleaning.py
50 lines (37 loc) · 2.01 KB
/
2_Categorical_Data_Cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)
y = df.age
X=df.drop(columns=['age'])
#numerical cols
num_cols = X.select_dtypes(include=np.number).columns
#categorical cols
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
for i in range(1000):
X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)
x_train_cat = x_train[cat_cols]
#Fill missing values with mode on categorical features only: fill with 'M'
x_train_fill_missing = x_train_cat.fillna(x_train_cat.mode().values[0][0]) #2d arr -> [row][col]
#Apply one hot encoding on x_train_fill_missing
ohe = OneHotEncoder(sparse=False, drop='first').fit(x_train_fill_missing) #every element is displayed -> 0 1
#Transform data after filling in missing values
x_train_fill_missing_ohe = ohe.transform(x_train_fill_missing)
#now we want to do the same thing on the test set!
x_test_fill_missing = x_test[cat_cols].fillna(x_train_cat.mode().values[0][0])
x_test_fill_missing_ohe = ohe.transform(x_test_fill_missing)
#1. Rewrite using Pipelines!
pipeline=Pipeline([("imputer", SimpleImputer(strategy='most_frequent')),("one_hot_encode", OneHotEncoder(drop = 'first',sparse = False))])
#2. Fit the pipeline and transform the test data (categorical columns only!)
pipeline.fit(x_train[cat_cols])
x_transform=pipeline.transform(x_test[cat_cols])
#3. Check if the two arrays are the same using np.array_equal()
check_arrays= np.array_equal(x_transform,x_test_fill_missing_ohe)
print(f'pipeline_arr == np_arr : {check_arrays}')