Commit ec65502

Updates from matei's review
1 parent: 00bc81e

5 files changed: 31 additions, 10 deletions


core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala

Lines changed: 3 additions & 3 deletions
@@ -39,15 +39,15 @@ private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, conf
 
   override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) {
     outputsMerged += 1
-    taskResult.foreach{ case (key,value) =>
+    taskResult.foreach { case (key, value) =>
       sums.changeValue(key, value, _ + value)
     }
   }
 
   override def currentResult(): Map[T, BoundedDouble] = {
     if (outputsMerged == totalOutputs) {
       val result = new JHashMap[T, BoundedDouble](sums.size)
-      sums.foreach{ case (key,sum) =>
+      sums.foreach { case (key, sum) =>
         result(key) = new BoundedDouble(sum, 1.0, sum, sum)
       }
       result
@@ -57,7 +57,7 @@ private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, conf
       val p = outputsMerged.toDouble / totalOutputs
       val confFactor = Probability.normalInverse(1 - (1 - confidence) / 2)
       val result = new JHashMap[T, BoundedDouble](sums.size)
-      sums.foreach{ case (key, sum) =>
+      sums.foreach { case (key, sum) =>
         val mean = (sum + 1 - p) / p
         val variance = (sum + 1) * (1 - p) / (p * p)
         val stdev = math.sqrt(variance)
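
The else branch above runs when only a fraction p of the task outputs have been merged: each observed sum is extrapolated to (sum + 1 - p) / p and wrapped in a normal-approximation confidence interval. Below is a minimal standalone sketch of that arithmetic; the interval shape mean ± confFactor * stdev is an assumption (the hunk cuts off before the BoundedDouble is built), and commons-math3's NormalDistribution stands in for Spark's Probability.normalInverse.

import org.apache.commons.math3.distribution.NormalDistribution

object ApproxCountSketch {
  // sum: occurrences of one key seen so far; p: fraction of task outputs merged.
  // Returns (estimated total, low bound, high bound).
  def estimate(sum: Long, p: Double, confidence: Double): (Double, Double, Double) = {
    val mean = (sum + 1 - p) / p                    // extrapolated count, as in the diff
    val variance = (sum + 1) * (1 - p) / (p * p)    // variance of the estimate, as in the diff
    val stdev = math.sqrt(variance)
    // Two-sided z value for the requested confidence (about 1.96 at 0.95),
    // standing in for Probability.normalInverse(1 - (1 - confidence) / 2).
    val confFactor = new NormalDistribution(0, 1)
      .inverseCumulativeProbability(1 - (1 - confidence) / 2)
    (mean, mean - confFactor * stdev, mean + confFactor * stdev)  // assumed interval shape
  }

  def main(args: Array[String]): Unit = {
    // A key seen 40 times after half the tasks merge:
    println(estimate(40L, 0.5, 0.95))  // prints roughly (81.0, 63.25, 98.75)
  }
}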

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 3 additions & 3 deletions
@@ -836,13 +836,13 @@ abstract class RDD[T: ClassTag](
     // TODO: This should perhaps be distributed by default.
     def countPartition(iter: Iterator[T]): Iterator[OpenHashMap[T,Long]] = {
       val map = new OpenHashMap[T,Long]
-      iter.foreach{
+      iter.foreach {
         t => map.changeValue(t, 1L, _ + 1L)
       }
       Iterator(map)
     }
     def mergeMaps(m1: OpenHashMap[T,Long], m2: OpenHashMap[T,Long]): OpenHashMap[T,Long] = {
-      m2.foreach{ case (key, value) =>
+      m2.foreach { case (key, value) =>
         m1.changeValue(key, value, _ + value)
       }
       m1
@@ -865,7 +865,7 @@
     }
     val countPartition: (TaskContext, Iterator[T]) => OpenHashMap[T,Long] = { (ctx, iter) =>
       val map = new OpenHashMap[T,Long]
-      iter.foreach{
+      iter.foreach {
         t => map.changeValue(t, 1L, _ + 1L)
       }
       map
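
Both hunks implement the same two-phase pattern (evidently for countByValue-style operations): build one hash map of counts per partition, then merge the per-partition maps pairwise. A self-contained sketch of that pattern, substituting scala.collection.mutable.HashMap for OpenHashMap, which this same commit makes private[spark]:

import scala.collection.mutable

object CountByValueSketch {
  // Per-partition counting, mirroring countPartition above; update + getOrElse
  // plays the role of OpenHashMap.changeValue(t, 1L, _ + 1L).
  def countPartition[T](iter: Iterator[T]): mutable.HashMap[T, Long] = {
    val map = new mutable.HashMap[T, Long]
    iter.foreach { t =>
      map.update(t, map.getOrElse(t, 0L) + 1L)
    }
    map
  }

  // Pairwise merge, mirroring mergeMaps above: fold m2's counts into m1.
  def mergeMaps[T](m1: mutable.HashMap[T, Long],
                   m2: mutable.HashMap[T, Long]): mutable.HashMap[T, Long] = {
    m2.foreach { case (key, value) =>
      m1.update(key, m1.getOrElse(key, 0L) + value)
    }
    m1
  }

  def main(args: Array[String]): Unit = {
    val partitions = Seq(Iterator("a", "b", "a"), Iterator("b", "c"))
    val total = partitions.map(p => countPartition(p)).reduce((a, b) => mergeMaps(a, b))
    println(total)  // HashMap(a -> 2, b -> 2, c -> 1), iteration order may vary
  }
}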

core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.util.collection
 
 import java.util.{Arrays, Comparator}
+
 import com.google.common.hash.Hashing
 
 import org.apache.spark.annotation.DeveloperApi

core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ import org.apache.spark.annotation.DeveloperApi
  * Under the hood, it uses our OpenHashSet implementation.
  */
 @DeveloperApi
+private[spark]
 class OpenHashMap[K : ClassTag, @specialized(Long, Int, Double) V: ClassTag](
     initialCapacity: Int)
   extends Iterable[(K, V)]
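
With private[spark] added, OpenHashMap stays annotated @DeveloperApi but is no longer reachable from user code. For reference, the changeValue(key, defaultValue, mergeValue) upsert that the other files in this commit lean on can be approximated over a standard map; the helper below is hypothetical, not Spark API:

import scala.collection.mutable

object ChangeValueSketch {
  // Same contract as OpenHashMap.changeValue: store defaultValue if the key
  // is absent, otherwise replace the old value with mergeValue(oldValue).
  def changeValue[K, V](map: mutable.HashMap[K, V], key: K,
                        defaultValue: => V, mergeValue: V => V): V = {
    val newValue = map.get(key) match {
      case Some(old) => mergeValue(old)
      case None      => defaultValue
    }
    map.update(key, newValue)
    newValue
  }

  def main(args: Array[String]): Unit = {
    val counts = new mutable.HashMap[String, Long]
    Seq("a", "b", "a").foreach(w => changeValue(counts, w, 1L, _ + 1L))
    println(counts)  // HashMap(a -> 2, b -> 1), iteration order may vary
  }
}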

streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala

Lines changed: 23 additions & 4 deletions
@@ -30,11 +30,30 @@ object RawTextHelper {
    */
   def splitAndCountPartitions(iter: Iterator[String]): Iterator[(String, Long)] = {
     val map = new OpenHashMap[String,Long]
-    val tokenized = iter.flatMap(_.split(" ").filterNot(_.isEmpty))
-    tokenized.foreach{ s =>
-      map.changeValue(s, 1L, _ + 1L)
+    var i = 0
+    var j = 0
+    while (iter.hasNext) {
+      val s = iter.next()
+      i = 0
+      while (i < s.length) {
+        j = i
+        while (j < s.length && s.charAt(j) != ' ') {
+          j += 1
+        }
+        if (j > i) {
+          val w = s.substring(i, j)
+          map.changeValue(w, 1L, _ + 1L)
+        }
+        i = j
+        while (i < s.length && s.charAt(i) == ' ') {
+          i += 1
+        }
+      }
+      map.toIterator.map {
+        case (k, v) => (k, v)
+      }
     }
-    map.iterator
+    map.toIterator.map{case (k, v) => (k, v)}
   }
 
   /**
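
The rewrite drops the split/flatMap tokenizer, which allocated an array per line, in favor of an index-based scan: advance j to the next space, count the word, then skip any run of spaces. A standalone sketch of that inner loop under the same single-space-delimiter assumption:

import scala.collection.mutable

object SplitAndCountSketch {
  // Index-based word counting over one line, mirroring the while loops above.
  // Each word is materialized exactly once, by substring; no intermediate arrays.
  def countWords(s: String, map: mutable.HashMap[String, Long]): Unit = {
    var i = 0
    while (i < s.length) {
      var j = i
      while (j < s.length && s.charAt(j) != ' ') {
        j += 1  // scan to the end of the current word
      }
      if (j > i) {
        val w = s.substring(i, j)
        map.update(w, map.getOrElse(w, 0L) + 1L)  // changeValue(w, 1L, _ + 1L)
      }
      i = j
      while (i < s.length && s.charAt(i) == ' ') {
        i += 1  // skip a run of spaces
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val map = new mutable.HashMap[String, Long]
    Iterator("to be or", "not to  be").foreach(countWords(_, map))
    println(map)  // HashMap(to -> 2, be -> 2, not -> 1, or -> 1), order may vary
  }
}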
