diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index f40037a8d9aa..6f4a30d4595a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -145,8 +145,12 @@ class KMeansModel private[ml] (
   /**
    * Return the K-means cost (sum of squared distances of points to their nearest center) for this
    * model on the given data.
+   *
+   * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator
+   *             instead. You can also get the cost on the training dataset in the summary.
    */
-  // TODO: Replace the temp fix when we have proper evaluators defined for clustering.
+  @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " +
+    "instead. You can also get the cost on the training dataset in the summary.", "2.4.0")
   @Since("2.0.0")
   def computeCost(dataset: Dataset[_]): Double = {
     SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol)
@@ -356,7 +360,12 @@ class KMeans @Since("1.5.0") (
     val parentModel = algo.run(instances, Option(instr))
     val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
     val summary = new KMeansSummary(
-      model.transform(dataset), $(predictionCol), $(featuresCol), $(k), parentModel.numIter)
+      model.transform(dataset),
+      $(predictionCol),
+      $(featuresCol),
+      $(k),
+      parentModel.numIter,
+      parentModel.trainingCost)
 
     model.setSummary(Some(summary))
     instr.logNamedValue("clusterSizes", summary.clusterSizes)
@@ -389,6 +398,8 @@ object KMeans extends DefaultParamsReadable[KMeans] {
  * @param featuresCol Name for column of features in `predictions`.
  * @param k Number of clusters.
  * @param numIter Number of iterations.
+ * @param trainingCost K-means cost (sum of squared distances to the nearest centroid for all
+ *                     points in the training dataset). This is equivalent to sklearn's inertia.
*/ @Since("2.0.0") @Experimental @@ -397,4 +408,6 @@ class KMeansSummary private[clustering] ( predictionCol: String, featuresCol: String, k: Int, - numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter) + numIter: Int, + @Since("2.4.0") val trainingCost: Double) + extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 37ae8b1a6171..53d163bab105 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -348,7 +348,7 @@ class KMeans private ( logInfo(s"The cost is $cost.") - new KMeansModel(centers.map(_.vector), distanceMeasure, iteration) + new KMeansModel(centers.map(_.vector), distanceMeasure, cost, iteration) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index e3a88b42fbf7..d5c8188144ce 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.{Row, SparkSession} @Since("0.8.0") class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector], @Since("2.4.0") val distanceMeasure: String, + @Since("2.4.0") val trainingCost: Double, private[spark] val numIter: Int) extends Saveable with Serializable with PMMLExportable { @@ -49,11 +50,11 @@ class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector], @Since("2.4.0") private[spark] def this(clusterCenters: Array[Vector], distanceMeasure: String) = - this(clusterCenters: Array[Vector], distanceMeasure, -1) + this(clusterCenters: Array[Vector], distanceMeasure, 0.0, -1) @Since("1.1.0") def this(clusterCenters: Array[Vector]) = - this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN) + this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN, 0.0, -1) /** * A Java-friendly constructor that takes an Iterable of Vectors. 
@@ -187,7 +188,8 @@ object KMeansModel extends Loader[KMeansModel] {
       val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
       val metadata = compact(render(
         ("class" -> thisClassName) ~ ("version" -> thisFormatVersion)
-          ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure)))
+          ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure)
+          ~ ("trainingCost" -> model.trainingCost)))
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
       val dataRDD = sc.parallelize(model.clusterCentersWithNorm.zipWithIndex).map { case (p, id) =>
         Cluster(id, p.vector)
@@ -207,7 +209,8 @@ object KMeansModel extends Loader[KMeansModel] {
       val localCentroids = centroids.rdd.map(Cluster.apply).collect()
       assert(k == localCentroids.length)
       val distanceMeasure = (metadata \ "distanceMeasure").extract[String]
-      new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure)
+      val trainingCost = (metadata \ "trainingCost").extract[Double]
+      new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure, trainingCost, -1)
     }
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index 829c90fe34e9..9b0b52617755 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -131,6 +131,8 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
       assert(summary.predictions.columns.contains(c))
     }
     assert(summary.cluster.columns === Array(predictionColName))
+    assert(summary.trainingCost < 0.1)
+    assert(model.computeCost(dataset) == summary.trainingCost)
     val clusterSizes = summary.clusterSizes
     assert(clusterSizes.length === k)
     assert(clusterSizes.sum === numRows)
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 2f0660040dc7..8a58d838819e 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -16,6 +16,7 @@
 #
 
 import sys
+import warnings
 
 from pyspark import since, keyword_only
 from pyspark.ml.util import *
@@ -303,7 +304,15 @@ class KMeansSummary(ClusteringSummary):
 
     .. versionadded:: 2.1.0
     """
-    pass
+
+    @property
+    @since("2.4.0")
+    def trainingCost(self):
+        """
+        K-means cost (sum of squared distances to the nearest centroid for all points in the
+        training dataset). This is equivalent to sklearn's inertia.
+        """
+        return self._call_java("trainingCost")
 
 
 class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
@@ -323,7 +332,13 @@ def computeCost(self, dataset):
         """
         Return the K-means cost (sum of squared distances of points to their nearest center)
         for this model on the given data.
+
+        .. note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead.
+            You can also get the cost on the training dataset in the summary.
         """
+        warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator "
+                      "instead. You can also get the cost on the training dataset in the summary.",
+                      DeprecationWarning)
         return self._call_java("computeCost", dataset)
 
     @property
@@ -379,6 +394,8 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol
     2
     >>> summary.clusterSizes
     [2, 2]
+    >>> summary.trainingCost
+    2.000...
     >>> kmeans_path = temp_path + "/kmeans"
     >>> kmeans.save(kmeans_path)
     >>> kmeans2 = KMeans.load(kmeans_path)
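
For reviewers, a minimal sketch of the `ml`-side API surface once this patch is applied. Only `summary.trainingCost` is new here; the `spark` session handle, the toy data, and the column name are illustrative assumptions, not part of the patch:

```scala
// Sketch only: assumes a SparkSession `spark` and a build containing this patch.
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.ml.linalg.Vectors

val dataset = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(0.0, 0.0)), Tuple1(Vectors.dense(1.0, 1.0)),
  Tuple1(Vectors.dense(9.0, 8.0)), Tuple1(Vectors.dense(8.0, 9.0))
)).toDF("features")

val model = new KMeans().setK(2).setSeed(1L).fit(dataset)

// New in this patch: the cost is captured during training, so reading it is free.
println(s"Training cost (inertia): ${model.summary.trainingCost}")

// Suggested replacement for the deprecated computeCost: silhouette via
// ClusteringEvaluator. Note it is a different metric, not a drop-in
// substitute for the squared-distance cost.
val silhouette = new ClusteringEvaluator().evaluate(model.transform(dataset))
```

Capturing `cost` inside `runAlgorithm` and threading it through the `KMeansModel` constructor is what lets the summary expose the value without the extra pass over the data that `computeCost` performs.

And a sketch of the `mllib` persistence round trip, showing that the new field survives save/load while `numIter` deliberately does not (the save path and `sc` handle are placeholders):

```scala
// Sketch only: assumes an active SparkContext `sc` and a build containing this patch.
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

// The public constructor now fills in trainingCost = 0.0 and numIter = -1.
val model = new KMeansModel(Array(Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0)))
model.save(sc, "/tmp/kmeans-model")  // metadata JSON now carries "trainingCost"

val loaded = KMeansModel.load(sc, "/tmp/kmeans-model")
assert(loaded.trainingCost == model.trainingCost)  // restored from metadata; numIter reloads as -1
```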