adjust comments and check child nodes' impurity

chouqin · chouqin · commit eefeef10ca43 · 2014-10-09T09:23:12.000+08:00
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -533,14 +533,11 @@ object DecisionTree extends Serializable with Logging {
     }
 
     // array of nodes to train indexed by node index in group
-    val nodes = {
-      val nodes = Array.fill[Node](numNodes)(null)
-      nodesForGroup.foreach { case (treeIndex, nodesForTree) =>
-        nodesForTree.foreach { node =>
-          nodes(treeToNodeToIndexInfo(treeIndex)(node.id).nodeIndexInGroup) = node
-        }
+    val nodes = Array.fill[Node](numNodes)(null)
+    nodesForGroup.foreach { case (treeIndex, nodesForTree) =>
+      nodesForTree.foreach { node =>
+        nodes(treeToNodeToIndexInfo(treeIndex)(node.id).nodeIndexInGroup) = node
       }
-      nodes
     }
 
     // Calculate best splits for all nodes in the group
@@ -607,14 +604,18 @@ object DecisionTree extends Serializable with Logging {
         if (!isLeaf) {
           node.split = Some(split)
           val childIsLeaf = (Node.indexToLevel(nodeIndex) + 1) == metadata.maxDepth
+          val leftChildIsLeaf = childIsLeaf || (stats.leftImpurity == 0.0)
+          val rightChildIsLeaf = childIsLeaf || (stats.rightImpurity == 0.0)
           node.leftNode = Some(Node(Node.leftChildIndex(nodeIndex),
-            stats.leftPredict, stats.leftImpurity, childIsLeaf))
+            stats.leftPredict, stats.leftImpurity, leftChildIsLeaf))
           node.rightNode = Some(Node(Node.rightChildIndex(nodeIndex),
-            stats.rightPredict, stats.rightImpurity, childIsLeaf))
+            stats.rightPredict, stats.rightImpurity, rightChildIsLeaf))
 
           // enqueue left child and right child if they are not leaves
-          if (!childIsLeaf) {
+          if (!leftChildIsLeaf) {
             nodeQueue.enqueue((treeIndex, node.leftNode.get))
+          }
+          if (!rightChildIsLeaf) {
             nodeQueue.enqueue((treeIndex, node.rightNode.get))
           }
 
@@ -691,11 +692,10 @@ object DecisionTree extends Serializable with Logging {
       rightImpurityCalculator: ImpurityCalculator): (Predict, Double) =  {
     val parentNodeAgg = leftImpurityCalculator.copy
     parentNodeAgg.add(rightImpurityCalculator)
-    val predict = parentNodeAgg.predict
-    val prob = parentNodeAgg.prob(predict)
+    val predict = calculatePredict(parentNodeAgg)
     val impurity = parentNodeAgg.calculate()
 
-    (new Predict(predict, prob), impurity)
+    (predict, impurity)
   }
 
   /**
@@ -709,7 +709,7 @@ object DecisionTree extends Serializable with Logging {
       featuresForNode: Option[Array[Int]],
       node: Node): (Split, InformationGainStats, Predict) = {
 
-    // calculate predict and impurity if current node are top node
+    // calculate predict and impurity if current node is top node
     val level = Node.indexToLevel(node.id)
     var predictWithImpurity: Option[(Predict, Double)] = if (level == 0) {
       None
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
@@ -33,7 +33,7 @@ import org.apache.spark.mllib.linalg.Vector
  * @param id integer node id, from 1
  * @param predict predicted value at the node
  * @param impurity current node impurity
- * @param isLeaf whether the leaf is a node
+ * @param isLeaf whether the node is a leaf
  * @param split split to calculate left and right nodes
  * @param leftNode  left child
  * @param rightNode right child
@@ -179,13 +179,13 @@ private[tree] object Node {
   /**
    * Construct a node with nodeIndex, predict, impurity and isLeaf parameters.
    * This is used in `DecisionTree.findBestSplits` to construct child nodes
-   * after find best splits for each node.
+   * after finding the best splits for parent nodes.
    * Other fields are set at next level.
    * @param nodeIndex integer node id, from 1
    * @param predict predicted value at the node
    * @param impurity current node impurity
-   * @param isLeaf whether the leaf is a node
-   * @return newed node instance
+   * @param isLeaf whether the node is a leaf
+   * @return new node instance
    */
   def apply(
       nodeIndex: Int,
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
@@ -748,6 +748,49 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
     assert(topNode.leftNode.get.impurity === 0.0)
     assert(topNode.rightNode.get.impurity === 0.0)
   }
+
+  test("Avoid aggregation if impurity is 0.0") {
+    val arr = new Array[LabeledPoint](4)
+    arr(0) = new LabeledPoint(0.0, Vectors.dense(1.0, 0.0, 0.0))
+    arr(1) = new LabeledPoint(1.0, Vectors.dense(0.0, 1.0, 1.0))
+    arr(2) = new LabeledPoint(0.0, Vectors.dense(2.0, 0.0, 0.0))
+    arr(3) = new LabeledPoint(1.0, Vectors.dense(0.0, 2.0, 1.0))
+    val input = sc.parallelize(arr)
+
+    val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
+      numClassesForClassification = 2, categoricalFeaturesInfo = Map(0 -> 3))
+    val metadata = DecisionTreeMetadata.buildMetadata(input, strategy)
+    val (splits, bins) = DecisionTree.findSplitsBins(input, metadata)
+
+    val treeInput = TreePoint.convertToTreeRDD(input, bins, metadata)
+    val baggedInput = BaggedPoint.convertToBaggedRDDWithoutSampling(treeInput)
+
+    val topNode = Node.emptyNode(nodeIndex = 1)
+    assert(topNode.predict.predict === Double.MinValue)
+    assert(topNode.impurity === -1.0)
+    assert(topNode.isLeaf === false)
+
+    val nodesForGroup = Map((0, Array(topNode)))
+    val treeToNodeToIndexInfo = Map((0, Map(
+      (topNode.id, new RandomForest.NodeIndexInfo(0, None))
+    )))
+    val nodeQueue = new mutable.Queue[(Int, Node)]()
+    DecisionTree.findBestSplits(baggedInput, metadata, Array(topNode),
+      nodesForGroup, treeToNodeToIndexInfo, splits, bins, nodeQueue)
+
+    // children with impurity 0.0 are leaves and must not be enqueued for further splitting
+    assert(nodeQueue.isEmpty)
+
+    // findBestSplits should have set predict and impurity on topNode
+    assert(topNode.predict.predict !== Double.MinValue)
+    assert(topNode.impurity !== -1.0)
+
+    // findBestSplits should have set predict and impurity on both child nodes
+    assert(topNode.leftNode.get.predict.predict === 0.0)
+    assert(topNode.rightNode.get.predict.predict === 1.0)
+    assert(topNode.leftNode.get.impurity === 0.0)
+    assert(topNode.rightNode.get.impurity === 0.0)
+  }
 }
 
 object DecisionTreeSuite {