diff --git a/README.md b/README.md
index 8217280bf..ec4d114d3 100644
--- a/README.md
+++ b/README.md
@@ -74,27 +74,31 @@ There are totally 27 workloads in HiBench. The workloads are divided into 6 cate
Gradient-boosted trees (GBT) is a popular regression method using ensembles of decision trees. This workload is implemented in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator.
-6. Linear Regression (Linear)
+6. XGBoost (XGBoost)
+
+ XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. This workload is implemented with the XGBoost4J-Spark API on top of spark.ml, and the input data set is generated by GradientBoostedTreeDataGenerator.
+
+7. Linear Regression (Linear)
Linear Regression (Linear) is a workload that implemented in spark.ml with ElasticNet. The input data set is generated by LinearRegressionDataGenerator.
-7. Latent Dirichlet Allocation (LDA)
+8. Latent Dirichlet Allocation (LDA)
Latent Dirichlet allocation (LDA) is a topic model which infers topics from a collection of text documents. This workload is implemented in spark.mllib and the input data set is generated by LDADataGenerator.
-8. Principal Components Analysis (PCA)
+9. Principal Components Analysis (PCA)
Principal component analysis (PCA) is a statistical method to find a rotation such that the first coordinate has the largest variance possible, and each succeeding coordinate in turn has the largest variance possible. PCA is used widely in dimensionality reduction. This workload is implemented in spark.ml. The input data set is generated by PCADataGenerator.
-9. Random Forest (RF)
+10. Random Forest (RF)
Random forests (RF) are ensembles of decision trees. Random forests are one of the most successful machine learning models for classification and regression. They combine many decision trees in order to reduce the risk of overfitting. This workload is implemented in spark.mllib and the input data set is generated by RandomForestDataGenerator.
-10. Support Vector Machine (SVM)
+11. Support Vector Machine (SVM)
Support Vector Machine (SVM) is a standard method for large-scale classification tasks. This workload is implemented in spark.mllib and the input data set is generated by SVMDataGenerator.
-11. Singular Value Decomposition (SVD)
+12. Singular Value Decomposition (SVD)
Singular value decomposition (SVD) factorizes a matrix into three matrices. This workload is implemented in spark.mllib and its input data set is generated by SVDDataGenerator.
diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py
index 272faac2e..5e7f04f44 100644
--- a/bin/functions/hibench_prop_env_mapping.py
+++ b/bin/functions/hibench_prop_env_mapping.py
@@ -127,6 +127,14 @@
MAX_BINS_GBT="hibench.gbt.maxBins",
NUM_ITERATIONS_GBT="hibench.gbt.numIterations",
LEARNING_RATE_GBT="hibench.gbt.learningRate",
+ # For XGBoost
+ NUM_EXAMPLES_XGBOOST="hibench.xgboost.examples",
+ NUM_FEATURES_XGBOOST="hibench.xgboost.features",
+ NUM_CLASSES_XGBOOST="hibench.xgboost.numClasses",
+ MAX_DEPTH_XGBOOST="hibench.xgboost.maxDepth",
+ MAX_BINS_XGBOOST="hibench.xgboost.maxBins",
+ NUM_ITERATIONS_XGBOOST="hibench.xgboost.numIterations",
+ LEARNING_RATE_XGBOOST="hibench.xgboost.learningRate",
# For Random Forest
NUM_EXAMPLES_RF="hibench.rf.examples",
NUM_FEATURES_RF="hibench.rf.features",
diff --git a/bin/workloads/ml/xgboost/prepare/prepare.sh b/bin/workloads/ml/xgboost/prepare/prepare.sh
new file mode 100755
index 000000000..9506ef163
--- /dev/null
+++ b/bin/workloads/ml/xgboost/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/xgboost.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench XGBoostDataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=`timestamp`
+
+run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostedTreeDataGenerator $INPUT_HDFS $NUM_EXAMPLES_XGBOOST $NUM_FEATURES_XGBOOST
+
+END_TIME=`timestamp`
+
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/xgboost/spark/run.sh b/bin/workloads/ml/xgboost/spark/run.sh
new file mode 100755
index 000000000..76d9c2caf
--- /dev/null
+++ b/bin/workloads/ml/xgboost/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/xgboost.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench XGBoost ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=`dir_size $INPUT_HDFS`
+START_TIME=`timestamp`
+run_spark_job com.intel.hibench.sparkbench.ml.XGBoost --numClasses $NUM_CLASSES_XGBOOST --maxDepth $MAX_DEPTH_XGBOOST --maxBins $MAX_BINS_XGBOOST --numIterations $NUM_ITERATIONS_XGBOOST --learningRate $LEARNING_RATE_XGBOOST $INPUT_HDFS
+END_TIME=`timestamp`
+
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench
diff --git a/conf/workloads/ml/xgboost.conf b/conf/workloads/ml/xgboost.conf
new file mode 100644
index 000000000..7788f6242
--- /dev/null
+++ b/conf/workloads/ml/xgboost.conf
@@ -0,0 +1,26 @@
+# Input data-set size (number of examples and of features) for each scale profile.
+hibench.xgboost.tiny.examples 10
+hibench.xgboost.tiny.features 100
+hibench.xgboost.small.examples 100
+hibench.xgboost.small.features 500
+hibench.xgboost.large.examples 1000
+hibench.xgboost.large.features 2000
+hibench.xgboost.huge.examples 1000
+hibench.xgboost.huge.features 4000
+hibench.xgboost.gigantic.examples 1000
+hibench.xgboost.gigantic.features 8000
+hibench.xgboost.bigdata.examples 1000
+hibench.xgboost.bigdata.features 12000
+# Effective sizes, picked by hibench.scale.profile. NOTE(review): "partitions" has no env mapping in this change; confirm it is used.
+hibench.xgboost.examples ${hibench.xgboost.${hibench.scale.profile}.examples}
+hibench.xgboost.features ${hibench.xgboost.${hibench.scale.profile}.features}
+hibench.xgboost.partitions ${hibench.default.map.parallelism}
+# Training parameters; spark/run.sh forwards them to the XGBoost job as command-line options.
+hibench.xgboost.numClasses 2
+hibench.xgboost.maxDepth 30
+hibench.xgboost.maxBins 32
+hibench.xgboost.numIterations 20
+hibench.xgboost.learningRate 0.1
+# HDFS locations of the generated input and of the workload output.
+hibench.workload.input ${hibench.hdfs.data.dir}/XGBoost/Input
+hibench.workload.output ${hibench.hdfs.data.dir}/XGBoost/Output
diff --git a/sparkbench/ml/pom.xml b/sparkbench/ml/pom.xml
index d57898a56..44ae66cc9 100644
--- a/sparkbench/ml/pom.xml
+++ b/sparkbench/ml/pom.xml
@@ -53,5 +53,15 @@
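       <artifactId>mahout-math</artifactId>
       <version>${mahout.version}</version>
     </dependency>
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+      <version>1.0.0</version>
+    </dependency>
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+      <version>1.0.0</version>
+    </dependency>
   </dependencies>
 </project>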
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala
new file mode 100644
index 000000000..9f68e5abd
--- /dev/null
+++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.hibench.sparkbench.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.mllib.tree.GradientBoostedTrees
+import org.apache.spark.mllib.tree.configuration.BoostingStrategy
+import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
+import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
+import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
+import org.apache.spark.sql.SparkSession
+import scopt.OptionParser
+
+/** HiBench workload: trains an XGBoost4J-Spark classifier and prints its held-out test error. */
+object XGBoost {
+
+  /** Workload options parsed from the command line by `main`. */
+  case class Params(
+    numClasses: Int = 2,
+    maxDepth: Int = 30,
+    maxBins: Int = 32,
+    numIterations: Int = 20,
+    learningRate: Double = 0.1,
+    dataPath: String = null
+  )
+
+  /** Parses `args` into [[Params]] and runs the workload; exits with status 1 if parsing fails. */
+  def main(args: Array[String]): Unit = {
+    val defaultParams = Params()
+    val parser = new OptionParser[Params]("XGBoost") {
+      head("XGBoost: use XGBoost for classification")
+      opt[Int]("numClasses")
+        .text(s"numClasses, default: ${defaultParams.numClasses}")
+        .action((x, c) => c.copy(numClasses = x))
+      opt[Int]("maxDepth")
+        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
+        .action((x, c) => c.copy(maxDepth = x))
+      opt[Int]("maxBins")
+        .text(s"maxBins, default: ${defaultParams.maxBins}")
+        .action((x, c) => c.copy(maxBins = x))
+      opt[Int]("numIterations")
+        .text(s"numIterations, default: ${defaultParams.numIterations}")
+        .action((x, c) => c.copy(numIterations = x))
+      opt[Double]("learningRate")
+        .text(s"learningRate, default: ${defaultParams.learningRate}")
+        .action((x, c) => c.copy(learningRate = x))
+      // Named placeholder so the generated usage text shows the required positional argument.
+      arg[String]("<dataPath>")
+        .required()
+        .text("data path for XGBoost")
+        .action((x, c) => c.copy(dataPath = x))
+    }
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case _ => sys.exit(1)
+    }
+  }
+
+  /**
+   * Loads the data set at `params.dataPath`, fits an `XGBoostClassifier` and prints the test error.
+   *
+   * Exits with status 1 unless `spark.executor.instances`, `spark.executor.cores`
+   * and `spark.task.cpus` are all set in the Spark configuration.
+   */
+  def run(params: Params): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName(s"XGBoost with $params")
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    import spark.implicits._
+
+    // Load the stored mllib LabeledPoints and convert them to an ml-API DataFrame.
+    val mllibRDD: RDD[LabeledPoint] = sc.objectFile(params.dataPath)
+    val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) }
+    val data = mlRDD.toDF
+
+    // Split the data into training and test sets (30% held out for testing)
+    val splits = data.randomSplit(Array(0.7, 0.3))
+    val (trainingData, testData) = (splits(0), splits(1))
+
+    val numWorkers = sc.getConf.getInt("spark.executor.instances", -1)
+    val numThreads = sc.getConf.getInt("spark.executor.cores", -1)
+    // NOTE(review): taskCPUs is only checked for presence below and never passed on;
+    // XGBoost4J-Spark may require nthread <= spark.task.cpus -- confirm against its docs.
+    val taskCPUs = sc.getConf.getInt("spark.task.cpus", -1)
+
+    if (numWorkers == -1 || numThreads == -1 || taskCPUs == -1) {
+      println("XGBoost error: should set spark.executor.instances, " +
+        "spark.executor.cores and spark.task.cpus in Spark Config")
+      sys.exit(1)
+    }
+
+    // Each key is listed once: a repeated key in a Map literal silently overrides the earlier one.
+    val xgbParam: Map[String, Any] = Map(
+      "eta" -> params.learningRate,
+      "num_round" -> params.numIterations,
+      "num_class" -> params.numClasses,
+      "max_depth" -> params.maxDepth,
+      "max_bin" -> params.maxBins,
+      "objective" -> "multi:softprob",
+      "num_workers" -> numWorkers,
+      "nthread" -> numThreads
+    )
+    val xgbClassifier = new XGBoostClassifier(xgbParam)
+      .setFeaturesCol("features")
+      .setLabelCol("label")
+
+    val model = xgbClassifier.fit(trainingData)
+    val predictions = model.transform(testData)
+
+    // Select (prediction, true label) and compute test error.
+    val evaluator = new MulticlassClassificationEvaluator()
+      .setLabelCol("label")
+      .setPredictionCol("prediction")
+      .setMetricName("accuracy")
+    val accuracy = evaluator.evaluate(predictions)
+    println(s"Test Error = ${1.0 - accuracy}")
+
+    spark.stop()
+  }
+}