# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This example provides an end-to-end pipeline for a common Kaggle competition.
# The entire pipeline includes common utilities such as k-fold cross validation
# and data pre-processing.
#
# Specifically, the example studies the `House Prices: Advanced Regression
# Techniques` challenge as a case study.
#
# The link to the problem on Kaggle:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques
import pandas as pd
from mxnet import autograd
from mxnet import gluon
from mxnet import np
from mxnet import npx

# Enable MXNet's NumPy-compatible semantics so that Gluon blocks accept and
# return mxnet.np arrays (needed when mixing Gluon with the mxnet.np API).
npx.set_np()
# After logging in to www.kaggle.com, the training and testing data sets can
# be downloaded at:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/train.csv
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/test.csv
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
all_X = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                   test.loc[:, 'MSSubClass':'SaleCondition']))

# Get all the numerical features and apply standardization.
numeric_feas = all_X.dtypes[all_X.dtypes != "object"].index
all_X[numeric_feas] = all_X[numeric_feas].apply(
    lambda x: (x - x.mean()) / x.std())
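
# Optional sanity check: each standardized numeric column should now have mean
# ~0 and standard deviation ~1 over the combined train/test rows, e.g.:
#   assert all_X[numeric_feas].mean().abs().max() < 1e-6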
# Convert categorical feature values to one-hot indicator columns, including
# a dedicated indicator column for N/A.
all_X = pd.get_dummies(all_X, dummy_na=True)
# Fill the remaining (numeric) N/A values with the per-column mean; since
# these columns were just standardized, this effectively imputes zeros.
all_X = all_X.fillna(all_X.mean())
num_train = train.shape[0]

# Convert the data to NDArrays to feed into gluon. (DataFrame.as_matrix was
# removed in pandas 1.0; to_numpy is its replacement. float32 matches the
# default parameter dtype of Gluon layers.)
X_train = np.array(all_X[:num_train].to_numpy(dtype='float32'))
X_test = np.array(all_X[num_train:].to_numpy(dtype='float32'))
y_train = np.array(train.SalePrice.to_numpy(dtype='float32'))
y_train = y_train.reshape((num_train, 1))
square_loss = gluon.loss.L2Loss()
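# Note: Gluon's L2Loss computes 0.5 * (pred - label) ** 2 per example; the
# factor of 2 in get_rmse_log below undoes that built-in 1/2.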

def get_rmse_log(net, X_train, y_train):
    """Gets root mse between the logarithms of the prediction and the truth."""
    num_train = X_train.shape[0]
    # Clip predictions into [1, inf) so the logarithm below is well defined.
    clipped_preds = np.clip(net(X_train), 1, float('inf'))
    return np.sqrt(2 * np.sum(square_loss(
        np.log(clipped_preds), np.log(y_train))).item() / num_train)
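# For reference, this is exactly the competition's evaluation metric:
#   rmse_log = sqrt((1/n) * sum((log(pred) - log(label)) ** 2))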

def get_net():
    """Gets a neural network. Better results are obtained with modifications."""
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(50, activation="relu"))
    net.add(gluon.nn.Dense(1))
    net.initialize()
    return net
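# The architecture is deliberately small: one hidden layer of 50 ReLU units
# feeding a single linear output for regression. Wider or deeper variants
# (not shown here) are natural things to try.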

# Note: `train` below shadows the raw training DataFrame read earlier, which
# is no longer needed at this point.
def train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
          weight_decay, batch_size):
    """Trains the model."""
    dataset_train = gluon.data.ArrayDataset(X_train, y_train)
    data_iter_train = gluon.data.DataLoader(dataset_train, batch_size,
                                            shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate,
                             'wd': weight_decay})
    net.initialize(force_reinit=True)
    for epoch in range(epochs):
        for data, label in data_iter_train:
            with autograd.record():
                output = net(data)
                loss = square_loss(output, label)
            loss.backward()
            trainer.step(batch_size)
        avg_loss = get_rmse_log(net, X_train, y_train)
        if epoch > verbose_epoch:
            print(f"Epoch {epoch}, train loss: {avg_loss}")
    return avg_loss
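# The value train() returns is the RMSE-log on its full training split after
# the final epoch, so it is directly comparable to the held-out loss computed
# in k_fold_cross_valid below.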

def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
                       learning_rate, weight_decay, batch_size):
    """Conducts k-fold cross validation for the model."""
    assert k > 1
    fold_size = X_train.shape[0] // k
    train_loss_sum = 0.0
    test_loss_sum = 0.0
    for test_idx in range(k):
        # Fold `test_idx` is held out for validation in this iteration.
        X_val_test = X_train[test_idx * fold_size:
                             (test_idx + 1) * fold_size, :]
        y_val_test = y_train[test_idx * fold_size: (test_idx + 1) * fold_size]
        # Concatenate the remaining k - 1 folds into the training split.
        val_train_defined = False
        for i in range(k):
            if i != test_idx:
                X_cur_fold = X_train[i * fold_size: (i + 1) * fold_size, :]
                y_cur_fold = y_train[i * fold_size: (i + 1) * fold_size]
                if not val_train_defined:
                    X_val_train = X_cur_fold
                    y_val_train = y_cur_fold
                    val_train_defined = True
                else:
                    X_val_train = np.concatenate([X_val_train, X_cur_fold],
                                                 axis=0)
                    y_val_train = np.concatenate([y_val_train, y_cur_fold],
                                                 axis=0)
        net = get_net()
        train_loss = train(net, X_val_train, y_val_train, epochs,
                           verbose_epoch, learning_rate, weight_decay,
                           batch_size)
        train_loss_sum += train_loss
        test_loss = get_rmse_log(net, X_val_test, y_val_test)
        print(f"Test loss: {test_loss}")
        test_loss_sum += test_loss
    return train_loss_sum / k, test_loss_sum / k
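# Illustrative fold layout (assuming the standard train.csv with 1460 rows
# and k=5): fold_size is 292, so on iteration test_idx=2 rows 584:876 form
# the validation fold and the other four folds are concatenated for training.
# Any rows beyond k * fold_size would be dropped by the integer division.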

# The set of hyperparameters. Better results are obtained with modifications.
# These parameters can be fine-tuned with k-fold cross-validation.
k = 5
epochs = 100
verbose_epoch = 95
learning_rate = 0.3
weight_decay = 100
batch_size = 100
train_loss, test_loss = k_fold_cross_valid(k, epochs, verbose_epoch, X_train,
                                           y_train, learning_rate,
                                           weight_decay, batch_size)
print(f"{k}-fold validation: Avg train loss: {train_loss}, "
      f"Avg test loss: {test_loss}")

def learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
          weight_decay, batch_size):
    """Trains the model and predicts on the test data set."""
    net = get_net()
    _ = train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
              weight_decay, batch_size)
    # X_test is the preprocessed feature matrix built above; the raw `test`
    # DataFrame is used only for its Id column in the submission file.
    preds = net(X_test).asnumpy()
    test['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test['Id'], test['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)

learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
      weight_decay, batch_size)
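# The resulting submission.csv has exactly two columns, Id and SalePrice,
# which is the format this Kaggle competition expects for submissions.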