Benchmarks to track performance changes in 'hist' method
SmirnovEgorRu authored and hcho3 committed Dec 19, 2019
1 parent 14f325e commit c787a59
Showing 4 changed files with 375 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -30,3 +30,5 @@
*.exe
*.out
*.app

/data
39 changes: 39 additions & 0 deletions hist_method/README.md
@@ -0,0 +1,39 @@
## How to run the benchmarks:
1. Obtain the XGBoost Python package. There are a few options:
- Build XGBoost from source manually:
```sh
git clone --recursive https://github.com/dmlc/xgboost
cd xgboost
make -j8
cd python-package
python setup.py install
cd ..
```
- Or install the latest released version from pip:
```sh
pip install xgboost
```
- More details are available [here](https://xgboost.readthedocs.io/en/latest/build.html)

2. Install the Python packages the benchmark depends on: requests, scikit-learn, pandas, and numpy. They can be installed with pip:
```sh
pip install requests scikit-learn pandas numpy
```
3. Run benchmarks with specified parameters:
```sh
cd tests/benchmark/hist_method
python xgboost_hist_method_bench.py --dataset <dataset> \
                                    --hw <platform> \
                                    --n_iter <n_iter> \
                                    --n_runs <n_runs> \
                                    --log <enable_log>
```

The benchmark downloads the required datasets from the Internet automatically, so no manual preparation is needed.
## Available parameters:
* **dataset** - dataset to use in the benchmark. Possible values: *"higgs1m", "airline-ohe", "msrank-10k"*. [Required].
* **platform** - platform to run the computation on. Possible values: *cpu, gpu*. [Default=cpu].
* **n_iter** - number of boosting iterations. Possible values: *integer > 0*. [Default=1000].
* **n_runs** - number of training and prediction runs used to obtain stable performance measurements. Possible values: *integer > 0*. [Default=5].
* **enable_log** - if *False*, no additional debug info is printed ("silent"=1); if *True*, per-kernel execution times are printed ("verbosity"=3). Possible values: *True, False*. [Default=False].
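
For example, the following invocation (parameter values chosen purely for illustration) runs the *higgs1m* benchmark on CPU for 500 boosting iterations with 3 measurement runs and verbose logging:
```sh
python xgboost_hist_method_bench.py --dataset higgs1m \
                                    --hw cpu \
                                    --n_iter 500 \
                                    --n_runs 3 \
                                    --log True
```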
220 changes: 220 additions & 0 deletions hist_method/bench_utils.py
@@ -0,0 +1,220 @@
#*******************************************************************************
# Copyright 2017-2019 by Contributors
# \file bench_utils.py
# \brief utilities for a benchmark of the 'hist' tree_method on both CPU and GPU architectures
# \author Egor Smirnov
#*******************************************************************************

import os
import re
import bz2
import sys
import timeit
import tarfile
import requests
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

if sys.version_info[0] >= 3:
from urllib.request import urlretrieve # pylint: disable=import-error,no-name-in-module
else:
from urllib import urlretrieve # pylint: disable=import-error,no-name-in-module

DATASET_DIR="./data/"


def measure(func, string, nrepeat):
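    # run func() nrepeat times with timeit and print an outlier-filtered average
    # alongside the raw per-run timings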
t = timeit.Timer(stmt="%s()" % func.__name__, setup="from __main__ import %s" % func.__name__)
res = t.repeat(repeat=nrepeat, number=1)

def box_filter(timing, left=0.25, right=0.75): # statistically remove outliers and compute average
timing.sort()
size = len(timing)
if size == 1:
return timing[0]

        Q1, Q3 = timing[int(size * left)], timing[int(size * right)]

        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

result = np.array([item for item in timing if lower < item < upper])
return np.mean(result)

timing = box_filter(res)
print((string + " = {:.4f} sec (").format(timing), res, ")")


def compute_logloss(y1, y2):
return log_loss(y1.ravel(), y2)


def download_file(url):
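    # stream the file at `url` into DATASET_DIR in 1 MiB chunks and return the local path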
local_filename = DATASET_DIR + url.split('/')[-1]
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=2**20):
if chunk:
f.write(chunk)
return local_filename


def load_higgs(nrows_train, nrows_test, dtype):
"""
Higgs dataset from UCI machine learning repository (
https://archive.ics.uci.edu/ml/datasets/HIGGS).
TaskType:binclass
NumberOfFeatures:28
NumberOfInstances:11M
"""
if not os.path.isfile(DATASET_DIR + "HIGGS.csv.gz"):
print("Loading data set...")
download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz")

print("Reading data set...")
data = pd.read_csv(DATASET_DIR + "HIGGS.csv.gz", delimiter=",", header=None, compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test)
print("Pre-processing data set...")

data = data[list(data.columns[1:])+list(data.columns[0:1])]
n_features = data.shape[1]-1
train_data = np.ascontiguousarray(data.values[:nrows_train,:n_features], dtype=dtype)
train_label = np.ascontiguousarray(data.values[:nrows_train,n_features], dtype=dtype)
test_data = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test,:n_features], dtype=dtype)
test_label = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test,n_features], dtype=dtype)
n_classes = len(np.unique(train_label))
return train_data, train_label, test_data, test_label, n_classes


def load_higgs1m(dtype):
return load_higgs(1000000, 500000, dtype)


def read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
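    # parse a LibSVM-style file ('label qid:q 1:v1 2:v2 ...') into dense numpy arrays;
    # the 'name:' prefixes are stripped so every remaining field becomes a numeric column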
X = np.zeros((n_samples, n_features))
y = np.zeros((n_samples,))

counter = 0

regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')

for line in file_obj:
line = str(line).replace("\\n'", "")
        # strip the 'feature_index:'/'qid:' prefixes, keeping only the numeric values
        line = regexp.sub(r'\g<1>', line)
line = line.rstrip(" \n\r").split(' ')

y[counter] = int(line[0])
X[counter] = [float(i) for i in line[1:]]

counter += 1
if counter == n_samples:
break

return np.array(X, dtype=dtype), np.array(y, dtype=dtype)


def _make_gen(reader):
b = reader(1024 * 1024)
while b:
yield b
b = reader(1024 * 1024)


def _count_lines(filename):
with open(filename, 'rb') as f:
f_gen = _make_gen(f.read)
return sum(buf.count(b'\n') for buf in f_gen)

def load_msrank_10k(dtype):
"""
    MSRank dataset (Microsoft Learning to Rank).
    TaskType:multiclass
    NumberOfFeatures:137
"""

url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
tar = DATASET_DIR + "msrank.tar.gz"

if not os.path.isfile(tar):
print("Loading data set...")
download_file(url)

if not os.path.isfile(DATASET_DIR + "MSRank/train.txt"):
tar = tarfile.open(tar, "r:gz")
tar.extractall(DATASET_DIR)
tar.close()

sets = []
labels = []
n_features = 137

print("Reading data set...")
for set_name in ['train.txt', 'vali.txt', 'test.txt']:
file_name = DATASET_DIR + os.path.join('MSRank', set_name)

n_samples = _count_lines(file_name)
with open(file_name, 'r') as file_obj:
X, y = read_libsvm_msrank(file_obj, n_samples, n_features, dtype)

sets.append(X)
labels.append(y)

sets[0] = np.vstack((sets[0], sets[1]))
labels[0] = np.hstack((labels[0], labels[1]))

sets = [ np.ascontiguousarray(sets[i]) for i in [0, 2]]
labels = [ np.ascontiguousarray(labels[i]) for i in [0, 2]]

n_classes = len(np.unique(labels[0]))

return sets[0], labels[0], sets[1], labels[1], n_classes


def load_airline_one_hot(dtype):
"""
    Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
TaskType:binclass
NumberOfFeatures:700
NumberOfInstances:10100000
"""
url = 'https://s3.amazonaws.com/benchm-ml--main/'

name_train = 'train-10m.csv'
name_test = 'test.csv'

sets = []
labels = []

categorical_names = ["Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"]
categorical_ids = [0, 1, 2, 4, 5, 6]

numeric_names = ["DepTime", "Distance"]
numeric_ids = [3, 7]

for name in [name_train, name_test]:
filename = os.path.join(DATASET_DIR, name)
if not os.path.exists(filename):
print("Loading", filename)
urlretrieve(url + name, filename)

print("Reading", filename)
df = pd.read_csv(filename, nrows=1000000) if name == 'train-10m.csv' else pd.read_csv(filename)
        X = df.drop(columns='dep_delayed_15min')
y = df["dep_delayed_15min"]

y_num = np.where(y == "Y", 1, 0)

sets.append(X)
labels.append(y_num)

n_samples_train = sets[0].shape[0]

X = pd.concat(sets)
X = pd.get_dummies(X, columns=categorical_names)
sets = [X[:n_samples_train], X[n_samples_train:]]

return sets[0], labels[0], sets[1], labels[1], 2
114 changes: 114 additions & 0 deletions hist_method/xgboost_hist_method_bench.py
@@ -0,0 +1,114 @@
#*******************************************************************************
# Copyright 2017-2019 by Contributors
# \file xgboost_hist_method_bench.py
# \brief a benchmark for the 'hist' tree_method on both CPU and GPU architectures
# \author Egor Smirnov
#*******************************************************************************

import argparse
import xgboost as xgb
from bench_utils import *

N_PERF_RUNS = 5
DTYPE=np.float32

xgb_params = {
'alpha': 0.9,
'max_bin': 256,
'scale_pos_weight': 2,
'learning_rate': 0.1,
'subsample': 1,
'reg_lambda': 1,
"min_child_weight": 0,
'max_depth': 8,
'max_leaves': 2**8,
}

def xgb_fit():
global model_xgb
dtrain = xgb.DMatrix(x_train, label=y_train)
model_xgb = xgb.train(xgb_params, dtrain, xgb_params['n_estimators'])

def xgb_predict_of_train_data():
global result_predict_xgb_train
dtest = xgb.DMatrix(x_train)
result_predict_xgb_train = model_xgb.predict(dtest)

def xgb_predict_of_test_data():
global result_predict_xgb_test
dtest = xgb.DMatrix(x_test)
result_predict_xgb_test = model_xgb.predict(dtest)


def load_dataset(dataset):
global x_train, y_train, x_test, y_test

    os.makedirs(DATASET_DIR, exist_ok=True)

datasets_dict = {
'higgs1m': load_higgs1m,
'msrank-10k': load_msrank_10k,
'airline-ohe':load_airline_one_hot
}

x_train, y_train, x_test, y_test, n_classes = datasets_dict[dataset](DTYPE)

if n_classes == -1:
xgb_params['objective'] = 'reg:squarederror'
elif n_classes == 2:
xgb_params['objective'] = 'binary:logistic'
else:
xgb_params['objective'] = 'multi:softprob'
xgb_params['num_class'] = n_classes

def parse_args():
global N_PERF_RUNS
parser = argparse.ArgumentParser()
parser.add_argument('--n_iter', required=False, type=int, default=1000)
parser.add_argument('--n_runs', default=N_PERF_RUNS, required=False, type=int)
parser.add_argument('--hw', choices=['cpu', 'gpu'], metavar='stage', required=False, default='cpu')
    # argparse's type=bool treats any non-empty string (including "False") as True,
    # so parse the documented True/False values explicitly
    parser.add_argument('--log', metavar='stage', required=False,
                        type=lambda x: x.lower() == 'true', default=False)
parser.add_argument('--dataset', choices=['higgs1m', "airline-ohe", "msrank-10k"],
metavar='stage', required=True)

args = parser.parse_args()
N_PERF_RUNS = args.n_runs

xgb_params['n_estimators'] = args.n_iter

if args.log:
xgb_params['verbosity'] = 3
else:
xgb_params['silent'] = 1

if args.hw == "cpu":
xgb_params['tree_method'] = 'hist'
xgb_params['predictor'] = 'cpu_predictor'
elif args.hw == "gpu":
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'

load_dataset(args.dataset)


def main():
parse_args()

print("Running ...")
    measure(xgb_fit, "XGBOOST training ", N_PERF_RUNS)
measure(xgb_predict_of_train_data, "XGBOOST predict (train data)", N_PERF_RUNS)
measure(xgb_predict_of_test_data, "XGBOOST predict (test data) ", N_PERF_RUNS)

print("Compute quality metrics...")

    train_logloss = compute_logloss(y_train, result_predict_xgb_train)
    test_logloss = compute_logloss(y_test, result_predict_xgb_test)

    print("LogLoss for train data set = {:.6f}".format(train_logloss))
    print("LogLoss for test data set = {:.6f}".format(test_logloss))

if __name__ == '__main__':
main()
