Simplifications to DecisionTree code:

jkbradley · jkbradley · commit 2ab763b2ca1b · 2014-09-09T10:42:46.000-07:00
No longer pre-allocate parentImpurities array in main train() method.
* parentImpurities values are now stored in individual nodes (in Node.stats.impurity).

No longer using Node.build since tree structure is constructed on-the-fly.
* Did not eliminate since it is public (Developer) API.

Also: Updated DecisionTreeSuite test "Second level node building with vs. without groups"
* generateOrderedLabeledPoints() modified so that it really does require 2 levels of internal nodes.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -89,8 +89,6 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
       s"DecisionTree currently only supports maxDepth <= 30, but was given maxDepth = $maxDepth.")
     // Number of nodes to allocate: max number of nodes possible given the depth of the tree, plus 1
     val maxNumNodesPlus1 = Node.startIndexInLevel(maxDepth + 1)
-    // Initialize an array to hold parent impurity calculations for each node.
-    val parentImpurities = new Array[Double](maxNumNodesPlus1)
     // dummy value for top node (updated during first split calculation)
     val nodes = new Array[Node](maxNumNodesPlus1)
 
@@ -131,7 +129,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
       // Find best split for all nodes at a level.
       timer.start("findBestSplits")
       val splitsStatsForLevel: Array[(Split, InformationGainStats)] =
-        DecisionTree.findBestSplits(treeInput, parentImpurities,
+        DecisionTree.findBestSplits(treeInput,
           metadata, level, nodes, splits, bins, maxLevelForSingleGroup, timer)
       timer.stop("findBestSplits")
 
@@ -158,20 +156,12 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
             nodes(parentNodeIndex).rightNode = Some(nodes(nodeIndex))
           }
         }
-        // Extract info for nodes at the next lower level.
-        timer.start("extractInfoForLowerLevels")
         if (level < maxDepth) {
-          val leftChildIndex = Node.leftChildIndex(nodeIndex)
-          val leftImpurity = stats.leftImpurity
-          logDebug("leftChildIndex = " + leftChildIndex + ", impurity = " + leftImpurity)
-          parentImpurities(leftChildIndex) = leftImpurity
-
-          val rightChildIndex = Node.rightChildIndex(nodeIndex)
-          val rightImpurity = stats.rightImpurity
-          logDebug("rightChildIndex = " + rightChildIndex + ", impurity = " + rightImpurity)
-          parentImpurities(rightChildIndex) = rightImpurity
+          logDebug("leftChildIndex = " + Node.leftChildIndex(nodeIndex) +
+            ", impurity = " + stats.leftImpurity)
+          logDebug("rightChildIndex = " + Node.rightChildIndex(nodeIndex) +
+            ", impurity = " + stats.rightImpurity)
         }
-        timer.stop("extractInfoForLowerLevels")
         logDebug("final best split = " + split)
       }
       require(Node.maxNodesInLevel(level) == splitsStatsForLevel.length)
@@ -189,17 +179,12 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
     logDebug("Extracting tree model")
     logDebug("#####################################")
 
-    // Initialize the top or root node of the tree.
-    val topNode = nodes(1)
-    // Build the full tree using the node info calculated in the level-wise best split calculations.
-    topNode.build(nodes)
-
     timer.stop("total")
 
     logInfo("Internal timing for DecisionTree:")
     logInfo(s"$timer")
 
-    new DecisionTreeModel(topNode, strategy.algo)
+    new DecisionTreeModel(nodes(1), strategy.algo)
   }
 
 }
@@ -408,7 +393,6 @@ object DecisionTree extends Serializable with Logging {
    * multiple groups if the level-wise training task could lead to memory overflow.
    *
    * @param input Training data: RDD of [[org.apache.spark.mllib.tree.impl.TreePoint]]
-   * @param parentImpurities Impurities for all parent nodes for the current level
    * @param metadata Learning and dataset metadata
    * @param level Level of the tree
    * @param splits possible splits for all features, indexed (numFeatures)(numSplits)
@@ -418,7 +402,6 @@ object DecisionTree extends Serializable with Logging {
    */
   private[tree] def findBestSplits(
       input: RDD[TreePoint],
-      parentImpurities: Array[Double],
       metadata: DecisionTreeMetadata,
       level: Int,
       nodes: Array[Node],
@@ -438,14 +421,14 @@ object DecisionTree extends Serializable with Logging {
       // Iterate over each group of nodes at a level.
       var groupIndex = 0
       while (groupIndex < numGroups) {
-        val bestSplitsForGroup = findBestSplitsPerGroup(input, parentImpurities, metadata, level,
+        val bestSplitsForGroup = findBestSplitsPerGroup(input, metadata, level,
           nodes, splits, bins, timer, numGroups, groupIndex)
         bestSplits = Array.concat(bestSplits, bestSplitsForGroup)
         groupIndex += 1
       }
       bestSplits
     } else {
-      findBestSplitsPerGroup(input, parentImpurities, metadata, level, nodes, splits, bins, timer)
+      findBestSplitsPerGroup(input, metadata, level, nodes, splits, bins, timer)
     }
   }
 
@@ -585,7 +568,6 @@ object DecisionTree extends Serializable with Logging {
    * Returns an array of optimal splits for a group of nodes at a given level
    *
    * @param input Training data: RDD of [[org.apache.spark.mllib.tree.impl.TreePoint]]
-   * @param parentImpurities Impurities for all parent nodes for the current level
    * @param metadata Learning and dataset metadata
    * @param level Level of the tree
    * @param nodes Array of all nodes in the tree.  Used for matching data points to nodes.
@@ -597,7 +579,6 @@ object DecisionTree extends Serializable with Logging {
    */
   private def findBestSplitsPerGroup(
       input: RDD[TreePoint],
-      parentImpurities: Array[Double],
       metadata: DecisionTreeMetadata,
       level: Int,
       nodes: Array[Node],
@@ -709,10 +690,8 @@ object DecisionTree extends Serializable with Logging {
     // Iterating over all nodes at this level
     var nodeIndex = 0
     while (nodeIndex < numNodes) {
-      val nodeImpurity = parentImpurities(globalNodeIndexOffset + nodeIndex)
-      logDebug("node impurity = " + nodeImpurity)
       bestSplits(nodeIndex) =
-        binsToBestSplit(binAggregates, nodeIndex, nodeImpurity, level, metadata, splits)
+        binsToBestSplit(binAggregates, nodeIndex, level, metadata, splits)
       logDebug("best split = " + bestSplits(nodeIndex)._1)
       nodeIndex += 1
     }
@@ -725,13 +704,11 @@ object DecisionTree extends Serializable with Logging {
    * Calculate the information gain for a given (feature, split) based upon left/right aggregates.
    * @param leftImpurityCalculator left node aggregates for this (feature, split)
    * @param rightImpurityCalculator right node aggregate for this (feature, split)
-   * @param topImpurity impurity of the parent node
    * @return information gain and statistics for all splits
    */
   private def calculateGainForSplit(
       leftImpurityCalculator: ImpurityCalculator,
       rightImpurityCalculator: ImpurityCalculator,
-      topImpurity: Double,
       level: Int,
       metadata: DecisionTreeMetadata): InformationGainStats = {
 
@@ -741,18 +718,13 @@ object DecisionTree extends Serializable with Logging {
     val totalCount = leftCount + rightCount
     if (totalCount == 0) {
       // Return arbitrary prediction.
-      return new InformationGainStats(0, topImpurity, topImpurity, topImpurity, 0)
+      return new InformationGainStats(0, 0, 0, 0, 0)
     }
 
     val parentNodeAgg = leftImpurityCalculator.copy
     parentNodeAgg.add(rightImpurityCalculator)
-    // impurity of parent node
-    val impurity = if (level > 0) {
-      topImpurity
-    } else {
-      parentNodeAgg.calculate()
-    }
 
+    val impurity = parentNodeAgg.calculate()
     val predict = parentNodeAgg.predict
     val prob = parentNodeAgg.prob(predict)
 
@@ -771,19 +743,15 @@ object DecisionTree extends Serializable with Logging {
    * Find the best split for a node.
    * @param binAggregates Bin statistics.
    * @param nodeIndex Index for node to split in this (level, group).
-   * @param nodeImpurity Impurity of the node (nodeIndex).
    * @return tuple for best split: (Split, information gain)
    */
   private def binsToBestSplit(
       binAggregates: DTStatsAggregator,
       nodeIndex: Int,
-      nodeImpurity: Double,
       level: Int,
       metadata: DecisionTreeMetadata,
       splits: Array[Array[Split]]): (Split, InformationGainStats) = {
 
-    logDebug("node impurity = " + nodeImpurity)
-
     // For each (feature, split), calculate the gain, and select the best (feature, split).
     Range(0, metadata.numFeatures).map { featureIndex =>
       val numSplits = metadata.numSplits(featureIndex)
@@ -803,8 +771,7 @@ object DecisionTree extends Serializable with Logging {
             val leftChildStats = binAggregates.getImpurityCalculator(nodeFeatureOffset, splitIdx)
             val rightChildStats = binAggregates.getImpurityCalculator(nodeFeatureOffset, numSplits)
             rightChildStats.subtract(leftChildStats)
-            val gainStats =
-              calculateGainForSplit(leftChildStats, rightChildStats, nodeImpurity, level, metadata)
+            val gainStats = calculateGainForSplit(leftChildStats, rightChildStats, level, metadata)
             (splitIdx, gainStats)
           }.maxBy(_._2.gain)
         (splits(featureIndex)(bestFeatureSplitIndex), bestFeatureGainStats)
@@ -816,8 +783,7 @@ object DecisionTree extends Serializable with Logging {
           Range(0, numSplits).map { splitIndex =>
             val leftChildStats = binAggregates.getImpurityCalculator(leftChildOffset, splitIndex)
             val rightChildStats = binAggregates.getImpurityCalculator(rightChildOffset, splitIndex)
-            val gainStats =
-              calculateGainForSplit(leftChildStats, rightChildStats, nodeImpurity, level, metadata)
+            val gainStats = calculateGainForSplit(leftChildStats, rightChildStats, level, metadata)
             (splitIndex, gainStats)
           }.maxBy(_._2.gain)
         (splits(featureIndex)(bestFeatureSplitIndex), bestFeatureGainStats)
@@ -887,8 +853,7 @@ object DecisionTree extends Serializable with Logging {
             val rightChildStats =
               binAggregates.getImpurityCalculator(nodeFeatureOffset, lastCategory)
             rightChildStats.subtract(leftChildStats)
-            val gainStats =
-              calculateGainForSplit(leftChildStats, rightChildStats, nodeImpurity, level, metadata)
+            val gainStats = calculateGainForSplit(leftChildStats, rightChildStats, level, metadata)
             (splitIndex, gainStats)
           }.maxBy(_._2.gain)
         val categoriesForSplit =
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
@@ -271,7 +271,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
     assert(bins(0).length === 0)
 
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(8), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
 
     val split = bestSplits(0)._1
@@ -303,7 +303,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
 
     val (splits, bins) = DecisionTree.findSplitsBins(rdd, metadata)
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(8), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
 
     val split = bestSplits(0)._1
@@ -357,7 +357,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
     assert(bins(0).length === 100)
 
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(8), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
@@ -385,7 +385,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
     assert(bins(0).length === 100)
 
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(2), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
@@ -414,7 +414,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
     assert(bins(0).length === 100)
 
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(2), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
@@ -443,7 +443,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
     assert(bins(0).length === 100)
 
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(2), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
     assert(bestSplits.length === 1)
     assert(bestSplits(0)._1.feature === 0)
@@ -468,26 +468,25 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
     assert(bins(0).length === 100)
 
     // Train a 1-node model
-    val strategyOneNode = new Strategy(Classification, Entropy, 1, 2, 100)
+    val strategyOneNode = new Strategy(Classification, Entropy, maxDepth = 1,
+      numClassesForClassification = 2, maxBins = 100)
     val modelOneNode = DecisionTree.train(rdd, strategyOneNode)
     val nodes: Array[Node] = new Array[Node](8)
     nodes(1) = modelOneNode.topNode
     nodes(1).leftNode = None
     nodes(1).rightNode = None
 
-    val parentImpurities = Array(0, 0.5, 0.5, 0.5)
-
     // Single group second level tree construction.
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, parentImpurities, metadata, 1, nodes,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 1, nodes,
       splits, bins, 10)
     assert(bestSplits.length === 2)
     assert(bestSplits(0)._2.gain > 0)
     assert(bestSplits(1)._2.gain > 0)
 
     // maxLevelForSingleGroup parameter is set to 0 to force splitting into groups for second
     // level tree construction.
-    val bestSplitsWithGroups = DecisionTree.findBestSplits(treeInput, parentImpurities, metadata, 1,
+    val bestSplitsWithGroups = DecisionTree.findBestSplits(treeInput, metadata, 1,
       nodes, splits, bins, 0)
     assert(bestSplitsWithGroups.length === 2)
     assert(bestSplitsWithGroups(0)._2.gain > 0)
@@ -517,7 +516,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
 
     val (splits, bins) = DecisionTree.findSplitsBins(rdd, metadata)
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(32), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
 
     assert(bestSplits.length === 1)
@@ -582,7 +581,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
 
     val (splits, bins) = DecisionTree.findSplitsBins(rdd, metadata)
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(32), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
 
     assert(bestSplits.length === 1)
@@ -609,7 +608,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
 
     val (splits, bins) = DecisionTree.findSplitsBins(rdd, metadata)
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(32), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
 
     assert(bestSplits.length === 1)
@@ -636,7 +635,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
 
     val (splits, bins) = DecisionTree.findSplitsBins(rdd, metadata)
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(32), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
 
     assert(bestSplits.length === 1)
@@ -660,7 +659,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
 
     val (splits, bins) = DecisionTree.findSplitsBins(rdd, metadata)
     val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
-    val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(32), metadata, 0,
+    val bestSplits = DecisionTree.findBestSplits(treeInput, metadata, 0,
       new Array[Node](0), splits, bins, 10)
 
     assert(bestSplits.length === 1)
@@ -709,13 +708,16 @@ object DecisionTreeSuite {
   def generateOrderedLabeledPoints(): Array[LabeledPoint] = {
     val arr = new Array[LabeledPoint](1000)
     for (i <- 0 until 1000) {
-      if (i < 600) {
-        val lp = new LabeledPoint(0.0, Vectors.dense(i.toDouble, 1000.0 - i))
-        arr(i) = lp
+      val label = if (i < 100) {
+        0.0
+      } else if (i < 500) {
+        1.0
+      } else if (i < 900) {
+        0.0
       } else {
-        val lp = new LabeledPoint(1.0, Vectors.dense(i.toDouble, 1000.0 - i))
-        arr(i) = lp
+        1.0
       }
+      arr(i) = new LabeledPoint(label, Vectors.dense(i.toDouble, 1000.0 - i))
     }
     arr
   }