@@ -19,8 +19,6 @@ package org.apache.spark.mllib.clustering
1919
2020import scala .collection .mutable .ArrayBuffer
2121
22- import breeze .linalg .{DenseVector => BDV , Vector => BV }
23-
2422import org .apache .spark .annotation .Experimental
2523import org .apache .spark .Logging
2624import org .apache .spark .SparkContext ._
@@ -127,10 +125,10 @@ class KMeans private (
127125 // Compute squared norms and cache them.
128126 val norms = data.map(Vectors .norm(_, 2.0 ))
129127 norms.persist()
130- val breezeData = data.map(_.toBreeze) .zip(norms).map { case (v, norm) =>
131- new BreezeVectorWithNorm (v, norm)
128+ val zippedData = data.zip(norms).map { case (v, norm) =>
129+ new VectorWithNorm (v, norm)
132130 }
133- val model = runBreeze(breezeData )
131+ val model = runAlgorithm(zippedData )
134132 norms.unpersist()
135133
136134 // Warn at the end of the run as well, for increased visibility.
@@ -142,9 +140,9 @@ class KMeans private (
142140 }
143141
144142 /**
145- * Implementation of K-Means using breeze .
143+ * Implementation of the K-Means algorithm.
146144 */
147- private def runBreeze (data : RDD [BreezeVectorWithNorm ]): KMeansModel = {
145+ private def runAlgorithm (data : RDD [VectorWithNorm ]): KMeansModel = {
148146
149147 val sc = data.sparkContext
150148
@@ -170,9 +168,18 @@ class KMeans private (
170168
171169 // Execute iterations of Lloyd's algorithm until all runs have converged
172170 while (iteration < maxIterations && ! activeRuns.isEmpty) {
// A partial contribution to a center: a coordinate-wise running sum paired with a count.
type WeightedPoint = (Array[Double], Long)

/**
 * Combines two partial contributions into one.
 *
 * The sum array of `p1` is updated in place and reused in the result, so no new
 * array is allocated; the array of `p2` is only read.
 *
 * @param p1 contribution that receives the merged sum
 * @param p2 contribution that is added into `p1`
 * @return the array of `p1` (now holding the element-wise total) with the summed counts
 */
def mergeContribs(p1: WeightedPoint, p2: WeightedPoint): WeightedPoint = {
  val (acc, accCount) = p1
  val (other, otherCount) = p2
  val dim = acc.length
  // Both sums must have the same dimension before they can be added element-wise.
  require(dim == other.length)
  var idx = 0
  while (idx < dim) {
    acc(idx) += other(idx)
    idx += 1
  }
  (acc, accCount + otherCount)
}
177184
178185 val activeCenters = activeRuns.map(r => centers(r)).toArray
@@ -185,16 +192,17 @@ class KMeans private (
185192 val thisActiveCenters = bcActiveCenters.value
186193 val runs = thisActiveCenters.length
187194 val k = thisActiveCenters(0 ).length
188- val dims = thisActiveCenters(0 )(0 ).vector.length
195+ val dims = thisActiveCenters(0 )(0 ).vector.size
189196
190- val sums = Array .fill(runs, k)(BDV .zeros [Double ](dims). asInstanceOf [ BV [ Double ]] )
197+ val sums = Array .fill(runs, k)(Array .ofDim [Double ](dims))
191198 val counts = Array .fill(runs, k)(0L )
192199
193200 points.foreach { point =>
194201 (0 until runs).foreach { i =>
195202 val (bestCenter, cost) = KMeans .findClosest(thisActiveCenters(i), point)
196203 costAccums(i) += cost
197- sums(i)(bestCenter) += point.vector
204+ val sum = sums(i)(bestCenter)
205+ point.vector.foreachActive((index, value) => sum(index) += value)
198206 counts(i)(bestCenter) += 1
199207 }
200208 }
@@ -212,8 +220,12 @@ class KMeans private (
212220 while (j < k) {
213221 val (sum, count) = totalContribs((i, j))
214222 if (count != 0 ) {
215- sum /= count.toDouble
216- val newCenter = new BreezeVectorWithNorm (sum)
223+ var i = 0
224+ while (i < sum.size) {
225+ sum(i) /= count
226+ i += 1
227+ }
228+ val newCenter = new VectorWithNorm (sum)
217229 if (KMeans .fastSquaredDistance(newCenter, centers(run)(j)) > epsilon * epsilon) {
218230 changed = true
219231 }
@@ -245,18 +257,18 @@ class KMeans private (
245257
246258 logInfo(s " The cost for the best run is $minCost. " )
247259
248- new KMeansModel (centers(bestRun).map(c => Vectors .fromBreeze( c.vector) ))
260+ new KMeansModel (centers(bestRun).map(c => c.vector))
249261 }
250262
/**
 * Picks `runs` independent sets of `k` initial cluster centers by random sampling.
 *
 * @param data the points (with their cached norms) to sample from
 * @return one array of centers per run, each center stored as a dense vector
 */
private def initRandom(data: RDD[VectorWithNorm])
: Array[Array[VectorWithNorm]] = {
  // One sampling pass (with replacement) yields the centers for every run at once,
  // so the data is not scanned once per run.
  val seed = new XORShiftRandom().nextInt()
  val sampled = data.takeSample(true, runs * k, seed).toSeq
  Array.tabulate(runs) { run =>
    // Run `run` owns the `k` consecutive samples starting at `run * k`.
    val start = run * k
    val chunk = sampled.slice(start, start + k)
    chunk.map { p =>
      new VectorWithNorm(Vectors.dense(p.vector.toArray), p.norm)
    }.toArray
  }
}
262274
@@ -269,8 +281,8 @@ class KMeans private (
269281 *
270282 * The original paper can be found at http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf.
271283 */
272- private def initKMeansParallel (data : RDD [BreezeVectorWithNorm ])
273- : Array [Array [BreezeVectorWithNorm ]] = {
284+ private def initKMeansParallel (data : RDD [VectorWithNorm ])
285+ : Array [Array [VectorWithNorm ]] = {
274286 // Initialize each run's center to a random point
275287 val seed = new XORShiftRandom ().nextInt()
276288 val sample = data.takeSample(true , runs, seed).toSeq
@@ -376,8 +388,8 @@ object KMeans {
376388 * Returns the index of the closest center to the given point, as well as the squared distance.
377389 */
378390 private [mllib] def findClosest (
379- centers : TraversableOnce [BreezeVectorWithNorm ],
380- point : BreezeVectorWithNorm ): (Int , Double ) = {
391+ centers : TraversableOnce [VectorWithNorm ],
392+ point : VectorWithNorm ): (Int , Double ) = {
381393 var bestDistance = Double .PositiveInfinity
382394 var bestIndex = 0
383395 var i = 0
@@ -402,35 +414,33 @@ object KMeans {
402414 * Returns the K-means cost of a given point against the given cluster centers.
403415 */
private[mllib] def pointCost(
    centers: TraversableOnce[VectorWithNorm],
    point: VectorWithNorm): Double = {
  // findClosest yields (index of the closest center, squared distance to it);
  // only the distance is the cost, so the index is discarded.
  val (_, cost) = findClosest(centers, point)
  cost
}
408420
/**
 * Squared Euclidean distance between two vectors, obtained by handing both vectors
 * and their precomputed norms to
 * [[org.apache.spark.mllib.util.MLUtils#fastSquaredDistance]].
 *
 * @param v1 first vector together with its norm
 * @param v2 second vector together with its norm
 */
private[clustering] def fastSquaredDistance(
    v1: VectorWithNorm,
    v2: VectorWithNorm): Double =
  MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm)
418430}
419431
/**
 * A vector with its norm for fast distance computation.
 *
 * The primary constructor stores whatever norm the caller supplies; the auxiliary
 * constructors compute it with `Vectors.norm(vector, 2.0)`.
 *
 * @param vector the wrapped vector
 * @param norm the norm carried alongside `vector`
 *
 * @see [[org.apache.spark.mllib.clustering.KMeans#fastSquaredDistance]]
 */
private[clustering]
class VectorWithNorm(val vector: Vector, val norm: Double) extends Serializable {

  /** Wraps `vector`, computing its norm via `Vectors.norm(vector, 2.0)`. */
  def this(vector: Vector) = this(vector, Vectors.norm(vector, 2.0))

  /** Wraps `array` as a dense vector (`Vectors.dense(array)`) and computes its norm. */
  def this(array: Array[Double]) = this(Vectors.dense(array))

  /**
   * Converts the vector to a dense vector.
   *
   * The stored norm is reused rather than recomputed.
   */
  def toDense: VectorWithNorm = new VectorWithNorm(Vectors.dense(vector.toArray), norm)
}
0 commit comments