16 changes: 10 additions & 6 deletions README.md
@@ -74,27 +74,31 @@ There are totally 27 workloads in HiBench. The workloads are divided into 6 categories.

Gradient-boosted trees (GBT) is a popular regression method using ensembles of decision trees. This workload is implemented in spark.mllib and the input data set is generated by GradientBoostedTreeDataGenerator.

6. Linear Regression (Linear)
6. XGBoost (XGBoost)

XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable. This workload is implemented with the XGBoost4J-Spark API on Spark DataFrames, and the input data set is generated by GradientBoostedTreeDataGenerator.

7. Linear Regression (Linear)

Linear Regression (Linear) is a workload implemented in spark.ml with ElasticNet regularization; a brief sketch follows this list. The input data set is generated by LinearRegressionDataGenerator.

7. Latent Dirichlet Allocation (LDA)
8. Latent Dirichlet Allocation (LDA)

Latent Dirichlet allocation (LDA) is a topic model which infers topics from a collection of text documents. This workload is implemented in spark.mllib and the input data set is generated by LDADataGenerator.

8. Principal Components Analysis (PCA)
9. Principal Components Analysis (PCA)

Principal component analysis (PCA) is a statistical method to find a rotation such that the first coordinate has the largest variance possible, and each succeeding coordinate in turn has the largest variance possible. PCA is used widely in dimensionality reduction. This workload is implemented in spark.ml. The input data set is generated by PCADataGenerator.

9. Random Forest (RF)
10. Random Forest (RF)

Random forests (RF) are ensembles of decision trees. Random forests are one of the most successful machine learning models for classification and regression. They combine many decision trees in order to reduce the risk of overfitting. This workload is implemented in spark.mllib and the input data set is generated by RandomForestDataGenerator.

10. Support Vector Machine (SVM)
11. Support Vector Machine (SVM)

Support Vector Machine (SVM) is a standard method for large-scale classification tasks. This workload is implemented in spark.mllib and the input data set is generated by SVMDataGenerator.

11. Singular Value Decomposition (SVD)
12. Singular Value Decomposition (SVD)

Singular value decomposition (SVD) factorizes a matrix into three matrices. This workload is implemented in spark.mllib and its input data set is generated by SVDDataGenerator.
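
For the ElasticNet-based Linear workload (item 7 above), here is a minimal spark.ml sketch; the input path, parameter values, and object name are illustrative placeholders rather than HiBench's actual code or defaults.

```scala
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession

// Minimal sketch of ElasticNet-regularized linear regression in spark.ml.
object LinearElasticNetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("LinearElasticNetSketch").getOrCreate()

    // Placeholder input; HiBench generates its data with LinearRegressionDataGenerator.
    val data = spark.read.format("libsvm").load("hdfs:///tmp/linear/input")

    val lr = new LinearRegression()
      .setMaxIter(100)         // illustrative values, not HiBench defaults
      .setRegParam(0.01)
      .setElasticNetParam(0.5) // 0.0 = pure L2 (ridge), 1.0 = pure L1 (lasso)

    val model = lr.fit(data)
    println(s"RMSE = ${model.summary.rootMeanSquaredError}")
    spark.stop()
  }
}
```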

8 changes: 8 additions & 0 deletions bin/functions/hibench_prop_env_mapping.py
@@ -127,6 +127,14 @@
    MAX_BINS_GBT="hibench.gbt.maxBins",
    NUM_ITERATIONS_GBT="hibench.gbt.numIterations",
    LEARNING_RATE_GBT="hibench.gbt.learningRate",
    # For XGBoost
    NUM_EXAMPLES_XGBOOST="hibench.xgboost.examples",
    NUM_FEATURES_XGBOOST="hibench.xgboost.features",
    NUM_CLASSES_XGBOOST="hibench.xgboost.numClasses",
    MAX_DEPTH_XGBOOST="hibench.xgboost.maxDepth",
    MAX_BINS_XGBOOST="hibench.xgboost.maxBins",
    NUM_ITERATIONS_XGBOOST="hibench.xgboost.numIterations",
    LEARNING_RATE_XGBOOST="hibench.xgboost.learningRate",
    # For Random Forest
    NUM_EXAMPLES_RF="hibench.rf.examples",
    NUM_FEATURES_RF="hibench.rf.features",
35 changes: 35 additions & 0 deletions bin/workloads/ml/xgboost/prepare/prepare.sh
@@ -0,0 +1,35 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

current_dir=`dirname "$0"`
current_dir=`cd "$current_dir"; pwd`
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/ml/xgboost.conf
. "${root_dir}/bin/functions/load_bench_config.sh"

enter_bench XGBoostDataPrepare ${workload_config} ${current_dir}
show_bannar start

rmr_hdfs $INPUT_HDFS || true
START_TIME=`timestamp`

run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostedTreeDataGenerator $INPUT_HDFS $NUM_EXAMPLES_XGBOOST $NUM_FEATURES_XGBOOST

END_TIME=`timestamp`

show_bannar finish
leave_bench
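
The prepare script above writes the dataset that XGBoost.scala (shown further down) reads back with sc.objectFile. As a hedged sketch of that contract, a generator only needs to produce an RDD[LabeledPoint] and save it as an object file; the real GradientBoostedTreeDataGenerator lives in sparkbench and may differ in details, so treat the names and logic below as illustrative.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

import scala.util.Random

// Illustrative stand-in for GradientBoostedTreeDataGenerator: emits random
// binary-labeled points and stores them the way sc.objectFile expects.
object DataGeneratorSketch {
  def main(args: Array[String]): Unit = {
    val Array(output, numExamples, numFeatures) = args
    val sc = new SparkContext(new SparkConf().setAppName("DataGeneratorSketch"))

    val data = sc.parallelize(0L until numExamples.toLong).map { _ =>
      val rnd = new Random()
      val features = Vectors.dense(Array.fill(numFeatures.toInt)(rnd.nextGaussian()))
      LabeledPoint(if (rnd.nextBoolean()) 1.0 else 0.0, features)
    }
    data.saveAsObjectFile(output) // XGBoost.scala loads this via sc.objectFile
    sc.stop()
  }
}
```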

34 changes: 34 additions & 0 deletions bin/workloads/ml/xgboost/spark/run.sh
@@ -0,0 +1,34 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
current_dir=`dirname "$0"`
current_dir=`cd "$current_dir"; pwd`
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/ml/xgboost.conf
. "${root_dir}/bin/functions/load_bench_config.sh"

enter_bench XGBoost ${workload_config} ${current_dir}
show_bannar start

rmr_hdfs $OUTPUT_HDFS || true

SIZE=`dir_size $INPUT_HDFS`
START_TIME=`timestamp`
run_spark_job com.intel.hibench.sparkbench.ml.XGBoost --numClasses $NUM_CLASSES_XGBOOST --maxDepth $MAX_DEPTH_XGBOOST --maxBins $MAX_BINS_XGBOOST --numIterations $NUM_ITERATIONS_XGBOOST --learningRate $LEARNING_RATE_XGBOOST $INPUT_HDFS
END_TIME=`timestamp`

gen_report ${START_TIME} ${END_TIME} ${SIZE}
show_bannar finish
leave_bench
26 changes: 26 additions & 0 deletions conf/workloads/ml/xgboost.conf
@@ -0,0 +1,26 @@
hibench.xgboost.tiny.examples 10
hibench.xgboost.tiny.features 100
hibench.xgboost.small.examples 100
hibench.xgboost.small.features 500
hibench.xgboost.large.examples 1000
hibench.xgboost.large.features 2000
hibench.xgboost.huge.examples 1000
hibench.xgboost.huge.features 4000
hibench.xgboost.gigantic.examples 1000
hibench.xgboost.gigantic.features 8000
hibench.xgboost.bigdata.examples 1000
hibench.xgboost.bigdata.features 12000


hibench.xgboost.examples ${hibench.xgboost.${hibench.scale.profile}.examples}
hibench.xgboost.features ${hibench.xgboost.${hibench.scale.profile}.features}
hibench.xgboost.partitions ${hibench.default.map.parallelism}

hibench.xgboost.numClasses 2
hibench.xgboost.maxDepth 30
hibench.xgboost.maxBins 32
hibench.xgboost.numIterations 20
hibench.xgboost.learningRate 0.1

hibench.workload.input ${hibench.hdfs.data.dir}/XGBoost/Input
hibench.workload.output ${hibench.hdfs.data.dir}/XGBoost/Output
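
A note on the ${...} entries above: HiBench expands nested placeholders from the inside out, so with hibench.scale.profile set to large, ${hibench.xgboost.${hibench.scale.profile}.examples} first becomes ${hibench.xgboost.large.examples} and then 1000. The sketch below illustrates that resolution logic; the real implementation lives in HiBench's Python launcher, so this Scala version is purely expository.

```scala
import scala.util.matching.Regex

object PlaceholderSketch {
  // Expand the innermost ${...} references first and repeat until the string
  // stops changing, so nested placeholders resolve from the inside out.
  def resolve(value: String, props: Map[String, String]): String = {
    val innermost: Regex = """\$\{([^${}]+)\}""".r
    val expanded = innermost.replaceAllIn(value, m =>
      Regex.quoteReplacement(props.getOrElse(m.group(1), m.matched)))
    if (expanded == value) value else resolve(expanded, props)
  }

  def main(args: Array[String]): Unit = {
    val props = Map(
      "hibench.scale.profile" -> "large",
      "hibench.xgboost.large.examples" -> "1000")
    // Prints "1000": the inner profile reference resolves first.
    println(resolve("${hibench.xgboost.${hibench.scale.profile}.examples}", props))
  }
}
```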
10 changes: 10 additions & 0 deletions sparkbench/ml/pom.xml
@@ -53,5 +53,15 @@
      <artifactId>mahout-math</artifactId>
      <version>${mahout.version}</version>
    </dependency>
    <dependency>
      <groupId>ml.dmlc</groupId>
      <artifactId>xgboost4j_${scala.binary.version}</artifactId>
      <version>1.0.0</version>
    </dependency>
    <dependency>
      <groupId>ml.dmlc</groupId>
      <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
      <version>1.0.0</version>
    </dependency>
  </dependencies>
</project>
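
For readers building with sbt rather than Maven, the equivalent coordinates would look like the snippet below (a hypothetical translation, assuming the same 1.0.0 version; %% appends the project's Scala binary version just as ${scala.binary.version} does in the pom).

```scala
// Hypothetical sbt equivalent of the two Maven dependencies above.
libraryDependencies ++= Seq(
  "ml.dmlc" %% "xgboost4j"       % "1.0.0",
  "ml.dmlc" %% "xgboost4j-spark" % "1.0.0"
)
```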
141 changes: 141 additions & 0 deletions sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala
@@ -0,0 +1,141 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
import scopt.OptionParser

object XGBoost {

  case class Params(
      numClasses: Int = 2,
      maxDepth: Int = 30,
      maxBins: Int = 32,
      numIterations: Int = 20,
      learningRate: Double = 0.1,
      dataPath: String = null
  )

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("XGBoost") {
      head("XGBoost: use XGBoost for classification")
      opt[Int]("numClasses")
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x, c) => c.copy(numClasses = x))
      opt[Int]("maxDepth")
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x, c) => c.copy(maxDepth = x))
      opt[Int]("maxBins")
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x, c) => c.copy(maxBins = x))
      opt[Int]("numIterations")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("learningRate")
        .text(s"learningRate, default: ${defaultParams.learningRate}")
        .action((x, c) => c.copy(learningRate = x))
      arg[String]("<dataPath>")
        .required()
        .text("data path for XGBoost")
        .action((x, c) => c.copy(dataPath = x))
    }
    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {

    val spark = SparkSession
      .builder
      .appName(s"XGBoost with $params")
      .getOrCreate()

    val sc = spark.sparkContext

    import spark.implicits._

    val dataPath = params.dataPath
    val numClasses = params.numClasses
    val maxDepth = params.maxDepth
    val maxBins = params.maxBins
    val numIterations = params.numIterations
    val learningRate = params.learningRate

    // Load the RDD[LabeledPoint] written by GradientBoostedTreeDataGenerator.
    val mllibRDD: RDD[LabeledPoint] = sc.objectFile(dataPath)
    // Convert mllib LabeledPoints to ml LabeledPoints so they can become a DataFrame.
    val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) }
    val data = mlRDD.toDF

    // Split the data into training and test sets (30% held out for testing).
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // XGBoost4J-Spark needs explicit worker and thread counts; derive them
    // from the Spark configuration and fail fast if they are not set.
    val numWorkers = sc.getConf.getInt("spark.executor.instances", -1)
    val numThreads = sc.getConf.getInt("spark.executor.cores", -1)
    val taskCPUs = sc.getConf.getInt("spark.task.cpus", -1)

    if (numWorkers == -1 || numThreads == -1 || taskCPUs == -1) {
      println("XGBoost error: should set spark.executor.instances, " +
        "spark.executor.cores and spark.task.cpus in Spark Config")
      sys.exit(1)
    }

    val xgbParam = Map(
      "eta" -> learningRate,
      "num_round" -> numIterations,
      "num_class" -> numClasses,
      "max_depth" -> maxDepth,
      "max_bin" -> maxBins,
      "objective" -> "multi:softprob",
      "num_workers" -> numWorkers,
      "nthread" -> numThreads
    )
    val xgbClassifier = new XGBoostClassifier(xgbParam)
      .setFeaturesCol("features")
      .setLabelCol("label")

    val model = xgbClassifier.fit(trainingData)

    // Make predictions on the held-out test set.
    val predictions = model.transform(testData)

    // Select (prediction, true label) and compute test error.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println(s"Test Error = ${1.0 - accuracy}")

    sc.stop()
  }
}
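
One design note on the parameter map in run(): multi:softprob with num_class covers the workload's 2-class default, but for strictly binary labels the more conventional XGBoost setup drops num_class and uses a binary objective. The snippet below is a hedged alternative for comparison, not what this change ships; values are literals only to keep it self-contained.

```scala
// Alternative (illustrative only): binary objective for the 2-class case.
val binaryParam: Map[String, Any] = Map(
  "eta" -> 0.1,
  "num_round" -> 20,
  "max_depth" -> 6,
  "max_bin" -> 32,
  "objective" -> "binary:logistic", // no "num_class" with a binary objective
  "num_workers" -> 4,
  "nthread" -> 4)
```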