-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Benchmarks to track performance changes in 'hist' method
- Loading branch information
1 parent
14f325e
commit c787a59
Showing
4 changed files
with
375 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,3 +30,5 @@ | |
*.exe | ||
*.out | ||
*.app | ||
|
||
/data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
## How to run the benchmarks: | ||
1. Obtain python package of XGBoost. There are a few options: | ||
- Build XGBoost from sources manually: | ||
```sh | ||
git clone --recursive https://github.com/dmlc/xgboost | ||
cd xgboost | ||
make -j8 | ||
cd python-package | ||
python setup.py install | ||
cd .. | ||
``` | ||
- Or download the latest available version from pip: | ||
```sh | ||
pip install xgboost | ||
``` | ||
- More details are available [here](https://xgboost.readthedocs.io/en/latest/build.html) | ||
|
||
2. Install the other Python packages the benchmark depends on. It currently requires requests, scikit-learn, pandas and numpy (numpy is pulled in automatically as a dependency of pandas and scikit-learn). You can install them with pip: | ||
```sh | ||
pip install requests scikit-learn pandas | ||
``` | ||
3. Run benchmarks with specified parameters: | ||
```sh | ||
cd tests/benchmark/hist_method | ||
python xgboost_bench.py --dataset <dataset> \ | ||
--hw <platform> \ | ||
--n_iter <n_iter> \ | ||
--n_runs <n_runs> \ | ||
--log <enable_log> | ||
``` | ||
|
||
The benchmark automatically downloads the required datasets from the Internet, so no manual download is needed. | ||
## Available parameters: | ||
* **dataset** - dataset to use in benchmark. Possible values: *"higgs1m", "airline-ohe", "msrank-10k"* [Required]. | ||
* **platform** - specify platform for computation. Possible values: *cpu, gpu*. [Default=cpu]. | ||
* **n_iter** - number of boosting iterations. Possible values: *integer > 0*. [Default=1000]. | ||
* **n_runs** - number of training and prediction measurements to obtain stable performance results. Possible values: *integer > 0*. [Default=5]. | ||
* **enable_log** - if False, no additional debug info is printed ("silent"=1); if True ("verbosity"=3), the execution time of each kernel is printed. Possible values: *True, False*. [Default=False]. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
#******************************************************************************* | ||
# Copyright 2017-2019 by Contributors | ||
# \file bench_utils.py | ||
# \brief utilities for a benchmark for 'hist' tree_method on both CPU/GPU architectures | ||
# \author Egor Smirnov | ||
#******************************************************************************* | ||
|
||
import os | ||
import re | ||
import bz2 | ||
import sys | ||
import timeit | ||
import tarfile | ||
import requests | ||
import numpy as np | ||
import pandas as pd | ||
from sklearn.metrics import log_loss | ||
|
||
if sys.version_info[0] >= 3: | ||
from urllib.request import urlretrieve # pylint: disable=import-error,no-name-in-module | ||
else: | ||
from urllib import urlretrieve # pylint: disable=import-error,no-name-in-module | ||
|
||
# Directory (relative to the current working directory) in which the dataset
# loaders look for, and store, downloaded dataset files.
DATASET_DIR = "./data/"
|
||
|
||
def _box_filter(timing, left=0.25, right=0.75):
    """Return the mean of `timing` after discarding statistical outliers.

    The values at the `left` and `right` fractional positions of the sorted
    sample are used as the lower and upper quartile; samples further than
    1.5 times the inter-quartile range outside that span are dropped.

    The input sequence is not modified.

    Raises:
        ValueError: if `timing` is empty.
    """
    ordered = sorted(timing)
    size = len(ordered)
    if size == 0:
        raise ValueError("at least one timing sample is required")
    if size == 1:
        return ordered[0]

    q_low = ordered[int(size * left)]
    q_high = ordered[int(size * right)]
    spread = q_high - q_low

    lower = q_low - 1.5 * spread
    upper = q_high + 1.5 * spread

    # Inclusive bounds: when all samples are equal the spread is zero and
    # strict comparisons would reject every sample, yielding a NaN mean.
    kept = [item for item in ordered if lower <= item <= upper]
    return float(np.mean(kept))


def measure(func, string, nrepeat):
    """Time `func` and print the outlier-filtered average.

    Args:
        func: zero-argument callable to benchmark. It is called once per
            repetition.
        string: label printed in front of the result.
        nrepeat: number of repetitions to time.

    Returns:
        The filtered average time of one call, in seconds.
    """
    # Passing the callable itself (instead of building an import statement
    # from its name) works for any callable, not only functions that are
    # defined in the ``__main__`` module.
    timer = timeit.Timer(func)
    res = timer.repeat(repeat=nrepeat, number=1)

    timing = _box_filter(res)
    # The raw timings are printed in the order in which they were measured.
    print("{} = {:.4f} sec (".format(string, timing), res, ")")
    return timing
|
||
|
||
def compute_logloss(y1, y2):
    """Return the log loss of the predictions `y2` against the labels `y1`.

    `y1` is flattened with ``ravel()`` before it is handed to
    ``sklearn.metrics.log_loss``; `y2` is passed through unchanged.
    """
    flat_labels = y1.ravel()
    return log_loss(flat_labels, y2)
|
||
|
||
def download_file(url):
    """Download `url` into DATASET_DIR and return the local file name.

    The local name is DATASET_DIR followed by the last path component of the
    URL. The body is streamed in 1 MiB chunks into a temporary ``.part`` file
    that is renamed to the final name only after the transfer has finished,
    so an interrupted download never leaves a truncated file under the final
    name (the loaders treat an existing file as a complete download).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    local_filename = DATASET_DIR + url.split('/')[-1]
    partial_filename = local_filename + ".part"

    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(partial_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=2**20):
                if chunk:  # skip empty chunks
                    f.write(chunk)

    # os.rename refuses to overwrite an existing file on some platforms.
    if os.path.exists(local_filename):
        os.remove(local_filename)
    os.rename(partial_filename, local_filename)
    return local_filename
|
||
|
||
def load_higgs(nrows_train, nrows_test, dtype):
    """
    Load a train/test split of the Higgs dataset from the UCI machine
    learning repository (https://archive.ics.uci.edu/ml/datasets/HIGGS).
    TaskType:binclass
    NumberOfFeatures:28
    NumberOfInstances:11M

    The gzipped CSV is downloaded into DATASET_DIR when it is not there yet.
    The first `nrows_train` rows form the training set and the following
    `nrows_test` rows the test set.

    Returns a tuple (train_data, train_label, test_data, test_label,
    n_classes) where n_classes is the number of distinct training labels.
    """
    archive_path = DATASET_DIR + "HIGGS.csv.gz"
    if not os.path.isfile(archive_path):
        print("Loading data set...")
        download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz")

    n_rows = nrows_train + nrows_test
    print("Reading data set...")
    data = pd.read_csv(archive_path, delimiter=",", header=None, compression="gzip", dtype=dtype, nrows=n_rows)
    print("Pre-processing data set...")

    # The first column is used as the label, all remaining columns as features.
    values = data.values
    train_rows = slice(None, nrows_train)
    test_rows = slice(nrows_train, n_rows)

    train_data = np.ascontiguousarray(values[train_rows, 1:], dtype=dtype)
    train_label = np.ascontiguousarray(values[train_rows, 0], dtype=dtype)
    test_data = np.ascontiguousarray(values[test_rows, 1:], dtype=dtype)
    test_label = np.ascontiguousarray(values[test_rows, 0], dtype=dtype)

    n_classes = len(np.unique(train_label))
    return train_data, train_label, test_data, test_label, n_classes
|
||
|
||
def load_higgs1m(dtype):
    """Load the Higgs dataset with 1,000,000 training and 500,000 test rows."""
    n_train, n_test = 1000000, 500000
    return load_higgs(n_train, n_test, dtype)
|
||
|
||
def read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
    """Parse up to `n_samples` lines of MSRank data into dense arrays.

    Each non-blank line is expected to look like
    ``<label> <key>:<value> <key>:<value> ...``. The keys are discarded and
    every value (including the one of the first ``key:value`` pair) becomes a
    feature, so each line must carry exactly `n_features` values.

    Args:
        file_obj: iterable of text or bytes lines.
        n_samples: number of rows to allocate and the maximum number of
            lines to read.
        n_features: number of values per line.
        dtype: numpy dtype of the returned arrays.

    Returns:
        A tuple (X, y) with shapes (n_samples, n_features) and (n_samples,).
        Rows for which no line was available stay zero.
    """
    # Allocate directly in the target dtype instead of building float64
    # arrays first and copying them afterwards.
    X = np.zeros((n_samples, n_features), dtype=dtype)
    y = np.zeros((n_samples,), dtype=dtype)

    key_value = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')

    row = 0
    for line in file_obj:
        if row >= n_samples:
            break
        if isinstance(line, bytes):
            line = line.decode()

        # Strip the "key:" prefixes, then split on any run of whitespace.
        tokens = key_value.sub(r'\g<1>', line).split()
        if not tokens:
            continue  # blank line

        y[row] = int(tokens[0])
        X[row] = [float(token) for token in tokens[1:]]
        row += 1

    return X, y
|
||
|
||
def _make_gen(reader):
    """Yield chunks obtained by calling `reader` with a 1 MiB size argument.

    Iteration stops as soon as `reader` returns a falsy value (for example an
    empty bytes object at end of file).
    """
    chunk_size = 1024 * 1024
    while True:
        chunk = reader(chunk_size)
        if not chunk:
            return
        yield chunk
|
||
|
||
def _count_lines(filename):
    """Return the number of lines in the file `filename`.

    The file is read in binary mode in 1 MiB chunks and newline bytes are
    counted. A non-empty final line that is not terminated by a newline is
    counted as well, so the result matches the number of lines obtained by
    iterating over the file.
    """
    chunk_size = 1024 * 1024
    n_lines = 0
    last_chunk = b''
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            n_lines += chunk.count(b'\n')
            last_chunk = chunk

    # Account for a trailing line without a terminating newline.
    if last_chunk and not last_chunk.endswith(b'\n'):
        n_lines += 1
    return n_lines
|
||
def load_msrank_10k(dtype):
    """
    Load the MSRank dataset shipped as ``msrank.tar.gz``.

    The archive is downloaded into DATASET_DIR and unpacked when needed, then
    ``MSRank/train.txt``, ``MSRank/vali.txt`` and ``MSRank/test.txt`` are
    parsed with read_libsvm_msrank using 137 values per row. The train and
    validation parts are stacked into a single training set; the test part is
    returned as the test set.

    Returns a tuple (train_data, train_label, test_data, test_label,
    n_classes) where n_classes is the number of distinct training labels.
    """

    url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
    archive_path = DATASET_DIR + "msrank.tar.gz"

    if not os.path.isfile(archive_path):
        print("Loading data set...")
        download_file(url)

    if not os.path.isfile(DATASET_DIR + "MSRank/train.txt"):
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it with archives from a trusted source.
        with tarfile.open(archive_path, "r:gz") as archive:
            archive.extractall(DATASET_DIR)

    sets = []
    labels = []
    # The parser keeps the value of every "key:value" pair on a line,
    # presumably 136 features plus the query id -- TODO confirm.
    n_features = 137

    print("Reading data set...")
    for set_name in ['train.txt', 'vali.txt', 'test.txt']:
        file_name = DATASET_DIR + os.path.join('MSRank', set_name)

        n_samples = _count_lines(file_name)
        with open(file_name, 'r') as file_obj:
            X, y = read_libsvm_msrank(file_obj, n_samples, n_features, dtype)

        sets.append(X)
        labels.append(y)

    # Merge the validation part into the training part.
    sets[0] = np.vstack((sets[0], sets[1]))
    labels[0] = np.hstack((labels[0], labels[1]))

    # Keep only (train + validation) and test.
    sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]]
    labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]]

    n_classes = len(np.unique(labels[0]))

    return sets[0], labels[0], sets[1], labels[1], n_classes
|
||
|
||
def load_airline_one_hot(dtype):
    """
    Airline dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
    TaskType:binclass

    ``train-10m.csv`` and ``test.csv`` are downloaded into DATASET_DIR when
    they are missing. Only the first 1,000,000 rows of the training file are
    read; the test file is read completely. The categorical columns are
    one-hot encoded over the concatenation of both parts so that train and
    test share the same columns.

    NOTE(review): `dtype` is accepted for signature compatibility with the
    other loaders but is not applied; the features are returned as pandas
    DataFrames.

    Returns a tuple (train_data, train_label, test_data, test_label, 2)
    where the labels are arrays holding 1 for "Y" and 0 otherwise.
    """
    url = 'https://s3.amazonaws.com/benchm-ml--main/'

    name_train = 'train-10m.csv'
    name_test = 'test.csv'
    target_column = 'dep_delayed_15min'

    categorical_names = ["Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"]

    sets = []
    labels = []

    for name in [name_train, name_test]:
        filename = os.path.join(DATASET_DIR, name)
        if not os.path.exists(filename):
            print("Loading", filename)
            urlretrieve(url + name, filename)

        print("Reading", filename)
        if name == name_train:
            df = pd.read_csv(filename, nrows=1000000)
        else:
            df = pd.read_csv(filename)

        # The axis has to be passed by keyword: pandas 2.0 no longer accepts
        # it as a positional argument.
        X = df.drop(target_column, axis=1)
        y = df[target_column]

        sets.append(X)
        labels.append(np.where(y == "Y", 1, 0))

    n_samples_train = sets[0].shape[0]

    # Encode train and test together so both get identical dummy columns.
    X = pd.concat(sets)
    X = pd.get_dummies(X, columns=categorical_names)
    sets = [X[:n_samples_train], X[n_samples_train:]]

    return sets[0], labels[0], sets[1], labels[1], 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#******************************************************************************* | ||
# Copyright 2017-2019 by Contributors | ||
# \file xgboost_hist_method_bench.py | ||
# \brief a benchmark for 'hist' tree_method on both CPU/GPU architectures | ||
# \author Egor Smirnov | ||
#******************************************************************************* | ||
|
||
import argparse | ||
import xgboost as xgb | ||
from bench_utils import * | ||
|
||
# Default number of timed repetitions per measurement (overridable with --n_runs).
N_PERF_RUNS = 5
# Floating-point type passed to the dataset loaders.
DTYPE = np.float32

# Base XGBoost parameters; further entries are added while the command line
# is parsed and once the dataset is loaded.
xgb_params = dict(
    alpha=0.9,
    max_bin=256,
    scale_pos_weight=2,
    learning_rate=0.1,
    subsample=1,
    reg_lambda=1,
    min_child_weight=0,
    max_depth=8,
    max_leaves=2**8,
)
|
||
def xbg_fit():
    """Train a booster on the global training data.

    Reads the globals x_train, y_train and xgb_params and stores the trained
    model in the global model_xgb. The number of boosting rounds is taken
    from xgb_params['n_estimators'].
    """
    global model_xgb
    # 'n_estimators' is only the round count for xgb.train(); it is not a
    # booster parameter, so it is kept out of the dict handed to the booster.
    booster_params = {
        key: value for key, value in xgb_params.items() if key != 'n_estimators'
    }
    dtrain = xgb.DMatrix(x_train, label=y_train)
    model_xgb = xgb.train(booster_params, dtrain, xgb_params['n_estimators'])
|
||
def xgb_predict_of_train_data():
    """Predict on the global training features with the global model.

    The predictions are stored in the global result_predict_xgb_train.
    """
    global result_predict_xgb_train
    train_matrix = xgb.DMatrix(x_train)
    result_predict_xgb_train = model_xgb.predict(train_matrix)
|
||
def xgb_predict_of_test_data():
    """Predict on the global test features with the global model.

    The predictions are stored in the global result_predict_xgb_test.
    """
    global result_predict_xgb_test
    test_matrix = xgb.DMatrix(x_test)
    result_predict_xgb_test = model_xgb.predict(test_matrix)
|
||
|
||
def load_dataset(dataset):
    """Load the dataset called `dataset` and configure the objective.

    The data is stored in the globals x_train, y_train, x_test and y_test.
    Depending on the number of classes reported by the loader,
    xgb_params['objective'] (and, for more than two classes,
    xgb_params['num_class']) is set.

    Args:
        dataset: one of 'higgs1m', 'msrank-10k' or 'airline-ohe'.

    Raises:
        KeyError: if `dataset` is not a known dataset name.
        OSError: if DATASET_DIR cannot be created.
    """
    global x_train, y_train, x_test, y_test

    try:
        os.mkdir(DATASET_DIR)
    except OSError:
        # An already existing directory is fine; any other failure (for
        # example missing permissions) must not be hidden.
        if not os.path.isdir(DATASET_DIR):
            raise

    datasets_dict = {
        'higgs1m': load_higgs1m,
        'msrank-10k': load_msrank_10k,
        'airline-ohe': load_airline_one_hot,
    }

    x_train, y_train, x_test, y_test, n_classes = datasets_dict[dataset](DTYPE)

    if n_classes == -1:
        xgb_params['objective'] = 'reg:squarederror'
    elif n_classes == 2:
        xgb_params['objective'] = 'binary:logistic'
    else:
        xgb_params['objective'] = 'multi:softprob'
        xgb_params['num_class'] = n_classes
|
||
def _str_to_bool(value):
    """Convert a command-line string such as "True" or "False" to a bool.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


def parse_args():
    """Parse the command line, fill xgb_params and load the chosen dataset.

    Updates the global N_PERF_RUNS from --n_runs, stores --n_iter in
    xgb_params['n_estimators'], selects the logging and hardware related
    parameters and finally calls load_dataset() with --dataset.
    """
    global N_PERF_RUNS
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_iter', required=False, type=int, default=1000)
    parser.add_argument('--n_runs', default=N_PERF_RUNS, required=False, type=int)
    parser.add_argument('--hw', choices=['cpu', 'gpu'], required=False, default='cpu')
    # type=bool would turn every non-empty string, including "False", into
    # True, so the text is parsed explicitly.
    parser.add_argument('--log', required=False, type=_str_to_bool, default=False)
    parser.add_argument('--dataset', choices=['higgs1m', "airline-ohe", "msrank-10k"],
                        required=True)

    args = parser.parse_args()
    N_PERF_RUNS = args.n_runs

    xgb_params['n_estimators'] = args.n_iter

    if args.log:
        xgb_params['verbosity'] = 3
    else:
        xgb_params['silent'] = 1

    if args.hw == "cpu":
        xgb_params['tree_method'] = 'hist'
        xgb_params['predictor'] = 'cpu_predictor'
    elif args.hw == "gpu":
        xgb_params['tree_method'] = 'gpu_hist'
        xgb_params['predictor'] = 'gpu_predictor'

    load_dataset(args.dataset)
|
||
|
||
def main():
    """Run the benchmark: parse arguments, time training and prediction,
    then print the log loss on the train and test data."""
    parse_args()

    print("Running ...")
    stages = (
        (xbg_fit, "XGBOOST training "),
        (xgb_predict_of_train_data, "XGBOOST predict (train data)"),
        (xgb_predict_of_test_data, "XGBOOST predict (test data) "),
    )
    for stage_func, stage_label in stages:
        measure(stage_func, stage_label, N_PERF_RUNS)

    print("Compute quality metrics...")

    train_loss = compute_logloss(y_train, result_predict_xgb_train)
    test_loss = compute_logloss(y_test, result_predict_xgb_test)

    print("LogLoss for train data set = {:.6f}".format(train_loss))
    print("LogLoss for test data set = {:.6f}".format(test_loss))


if __name__ == '__main__':
    main()