# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This example provides an end-to-end pipeline for a common Kaggle competition.
# The entire pipeline includes common utilities such as k-fold cross validation
# and data pre-processing.
#
# Specifically, the example studies the `House Prices: Advanced Regression
# Techniques` challenge as a case study.
#
# The link to the problem on Kaggle:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques
import pandas as pd
from mxnet import autograd
from mxnet import gluon
from mxnet import np
from mxnet import npx

# Enable MXNet's NumPy-compatible semantics so that Gluon blocks accept and
# return mxnet.np arrays (needed when mixing Gluon with the mxnet.np API).
npx.set_np()
# After logging in to www.kaggle.com, the training and testing data sets can
# be downloaded at:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/train.csv
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/test.csv
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
all_X = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                   test.loc[:, 'MSSubClass':'SaleCondition']))

# Get all the numerical features and apply standardization.
numeric_feas = all_X.dtypes[all_X.dtypes != "object"].index
all_X[numeric_feas] = all_X[numeric_feas].apply(
    lambda x: (x - x.mean()) / x.std())
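
# Optional sanity check: each standardized numeric column should now have mean
# ~0 and standard deviation ~1 over the combined train/test rows, e.g.:
#   assert all_X[numeric_feas].mean().abs().max() < 1e-6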
# Convert categorical feature values to one-hot indicator columns, including
# a dedicated indicator column for N/A.
all_X = pd.get_dummies(all_X, dummy_na=True)
# Fill the remaining (numeric) N/A values with the per-column mean; since
# these columns were just standardized, this effectively imputes zeros.
all_X = all_X.fillna(all_X.mean())
num_train = train.shape[0]

# Convert the data to NDArrays to feed into gluon. (DataFrame.as_matrix was
# removed in pandas 1.0; to_numpy is its replacement. float32 matches the
# default parameter dtype of Gluon layers.)
X_train = np.array(all_X[:num_train].to_numpy(dtype='float32'))
X_test = np.array(all_X[num_train:].to_numpy(dtype='float32'))
y_train = np.array(train.SalePrice.to_numpy(dtype='float32'))
y_train = y_train.reshape((num_train, 1))
square_loss = gluon.loss.L2Loss()
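# Note: Gluon's L2Loss computes 0.5 * (pred - label) ** 2 per example; the
# factor of 2 in get_rmse_log below undoes that built-in 1/2.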

def get_rmse_log(net, X_train, y_train):
    """Gets root mse between the logarithms of the prediction and the truth."""
    num_train = X_train.shape[0]
    # Clip predictions into [1, inf) so the logarithm below is well defined.
    clipped_preds = np.clip(net(X_train), 1, float('inf'))
    return np.sqrt(2 * np.sum(square_loss(
        np.log(clipped_preds), np.log(y_train))).item() / num_train)
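# For reference, this is exactly the competition's evaluation metric:
#   rmse_log = sqrt((1/n) * sum((log(pred) - log(label)) ** 2))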

def get_net():
    """Gets a neural network. Better results are obtained with modifications."""
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(50, activation="relu"))
    net.add(gluon.nn.Dense(1))
    net.initialize()
    return net
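# The architecture is deliberately small: one hidden layer of 50 ReLU units
# feeding a single linear output for regression. Wider or deeper variants
# (not shown here) are natural things to try.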

# Note: `train` below shadows the raw training DataFrame read earlier, which
# is no longer needed at this point.
def train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
          weight_decay, batch_size):
    """Trains the model."""
    dataset_train = gluon.data.ArrayDataset(X_train, y_train)
    data_iter_train = gluon.data.DataLoader(dataset_train, batch_size,
                                            shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate,
                             'wd': weight_decay})
    net.initialize(force_reinit=True)
    for epoch in range(epochs):
        for data, label in data_iter_train:
            with autograd.record():
                output = net(data)
                loss = square_loss(output, label)
            loss.backward()
            trainer.step(batch_size)
        avg_loss = get_rmse_log(net, X_train, y_train)
        if epoch > verbose_epoch:
            print(f"Epoch {epoch}, train loss: {avg_loss}")
    return avg_loss
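# The value train() returns is the RMSE-log on its full training split after
# the final epoch, so it is directly comparable to the held-out loss computed
# in k_fold_cross_valid below.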

def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
                       learning_rate, weight_decay, batch_size):
    """Conducts k-fold cross validation for the model."""
    assert k > 1
    fold_size = X_train.shape[0] // k
    train_loss_sum = 0.0
    test_loss_sum = 0.0
    for test_idx in range(k):
        # Fold `test_idx` is held out for validation in this iteration.
        X_val_test = X_train[test_idx * fold_size:
                             (test_idx + 1) * fold_size, :]
        y_val_test = y_train[test_idx * fold_size: (test_idx + 1) * fold_size]
        # Concatenate the remaining k - 1 folds into the training split.
        val_train_defined = False
        for i in range(k):
            if i != test_idx:
                X_cur_fold = X_train[i * fold_size: (i + 1) * fold_size, :]
                y_cur_fold = y_train[i * fold_size: (i + 1) * fold_size]
                if not val_train_defined:
                    X_val_train = X_cur_fold
                    y_val_train = y_cur_fold
                    val_train_defined = True
                else:
                    X_val_train = np.concatenate([X_val_train, X_cur_fold],
                                                 axis=0)
                    y_val_train = np.concatenate([y_val_train, y_cur_fold],
                                                 axis=0)
        net = get_net()
        train_loss = train(net, X_val_train, y_val_train, epochs,
                           verbose_epoch, learning_rate, weight_decay,
                           batch_size)
        train_loss_sum += train_loss
        test_loss = get_rmse_log(net, X_val_test, y_val_test)
        print(f"Test loss: {test_loss}")
        test_loss_sum += test_loss
    return train_loss_sum / k, test_loss_sum / k
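# Illustrative fold layout (assuming the standard train.csv with 1460 rows
# and k=5): fold_size is 292, so on iteration test_idx=2 rows 584:876 form
# the validation fold and the other four folds are concatenated for training.
# Any rows beyond k * fold_size would be dropped by the integer division.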

# The set of hyperparameters. Better results are obtained with modifications.
# These parameters can be fine-tuned with k-fold cross-validation.
k = 5
epochs = 100
verbose_epoch = 95
learning_rate = 0.3
weight_decay = 100
batch_size = 100
train_loss, test_loss = k_fold_cross_valid(k, epochs, verbose_epoch, X_train,
                                           y_train, learning_rate,
                                           weight_decay, batch_size)
print(f"{k}-fold validation: Avg train loss: {train_loss}, "
      f"Avg test loss: {test_loss}")

def learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
          weight_decay, batch_size):
    """Trains the model and predicts on the test data set."""
    net = get_net()
    _ = train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
              weight_decay, batch_size)
    # X_test is the preprocessed feature matrix built above; the raw `test`
    # DataFrame is used only for its Id column in the submission file.
    preds = net(X_test).asnumpy()
    test['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test['Id'], test['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)

learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
      weight_decay, batch_size)
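# The resulting submission.csv has exactly two columns, Id and SalePrice,
# which is the format this Kaggle competition expects for submissions.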