@@ -21,6 +21,14 @@ import breeze.linalg.{Vector => BV}
2121import org .apache .spark .mllib .linalg .{Vector , Vectors }
2222import org .apache .spark .rdd .RDD
2323
/**
 * Named result type returned by `VectorRDDFunctions.summarizeStatistics`, replacing the
 * positional six-element tuple that the former `statistics` method returned.
 *
 * NOTE(review): the fields presumably hold per-dimension (column-wise) statistics over the
 * vectors of the RDD, each with `size` entries -- confirm against the aggregation logic.
 *
 * @param mean       vector of means
 * @param variance   vector of variances
 * @param count      count as a `Long`; `summarizeStatistics` derives it from the aggregated
 *                   result via `.toLong` (presumably the number of vectors -- confirm)
 * @param max        vector of maxima
 * @param min        vector of minima
 * @param nonZeroCnt vector of non-zero counts (presumably non-zero entries per dimension --
 *                   TODO confirm)
 */
24+ case class VectorRDDStatisticalSummary (
25+ mean : Vector ,
26+ variance : Vector ,
27+ count : Long ,
28+ max : Vector ,
29+ min : Vector ,
30+ nonZeroCnt : Vector ) extends Serializable
31+
2432/**
2533 * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector ]] through an
2634 * implicit conversion. Import `org.apache.spark.MLContext._` at the top of your program to use
@@ -40,7 +48,7 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
4048 * }}},
4149 * with the size of Vector as input parameter.
4250 */
43- def statistics (size : Int ): ( Vector , Vector , Double , Vector , Vector , Vector ) = {
51+ def summarizeStatistics (size : Int ): VectorRDDStatisticalSummary = {
4452 val results = self.map(_.toBreeze).aggregate((
4553 BV .zeros[Double ](size),
4654 BV .zeros[Double ](size),
@@ -83,9 +91,10 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
8391 }
8492 )
8593
86- (Vectors .fromBreeze(results._1),
94+ VectorRDDStatisticalSummary (
95+ Vectors .fromBreeze(results._1),
8796 Vectors .fromBreeze(results._2 :/ results._3),
88- results._3,
97+ results._3.toLong ,
8998 Vectors .fromBreeze(results._4),
9099 Vectors .fromBreeze(results._5),
91100 Vectors .fromBreeze(results._6))
0 commit comments