22 changes: 21 additions & 1 deletion mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -64,6 +64,7 @@ object MLUtils {
* feature dimensions.
* @param minPartitions min number of partitions
* @return labeled data stored as an RDD[LabeledPoint]
* @since 1.0.0
*/
def loadLibSVMFile(
sc: SparkContext,
@@ -113,7 +114,10 @@ object MLUtils {
}

// Convenient methods for `loadLibSVMFile`.


Contributor

I think we need to leave the @deprecated outside the JavaDoc as a Scala annotation to make it work in Scala.

/**
* @since 1.0.0
*/
@deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
def loadLibSVMFile(
sc: SparkContext,
@@ -126,13 +130,17 @@
/**
* Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of
* partitions.
* @since 1.0.0
*/
def loadLibSVMFile(
sc: SparkContext,
Contributor

We still need to keep this as an annotation. @deprecated in comments is a JavaDoc tag, while @deprecated outside comments is a Scala annotation, which helps generate compiler warnings.

We should keep each PR minimal, so please keep all @deprecated annotations unchanged.
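
For illustration, a minimal, self-contained sketch of that distinction (the object and method names are illustrative, not part of this diff):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils.loadLibSVMFile
import org.apache.spark.rdd.RDD

object DeprecationSketch {
  /**
   * The @deprecated below is only a doc tag: it appears in the
   * generated ScalaDoc/JavaDoc, but the compiler ignores it.
   * @deprecated Use the overload without the multiclass argument.
   */
  @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
  // The annotation above is what makes scalac emit a deprecation
  // warning at every call site.
  def loadLibSVMFileCompat(sc: SparkContext, path: String, multiclass: Boolean): RDD[LabeledPoint] =
    loadLibSVMFile(sc, path)
}
```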

path: String,
numFeatures: Int): RDD[LabeledPoint] =
loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions)

/**
* @since 1.0.0
*/
@deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
def loadLibSVMFile(
sc: SparkContext,
@@ -141,6 +149,9 @@
numFeatures: Int): RDD[LabeledPoint] =
loadLibSVMFile(sc, path, numFeatures)

/**
* @since 1.0.0
*/
@deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
def loadLibSVMFile(
sc: SparkContext,
@@ -151,6 +162,7 @@
/**
* Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of
* features determined automatically and the default number of partitions.
* @since 1.0.0
*/
def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] =
loadLibSVMFile(sc, path, -1)
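
For context, a minimal usage sketch of this overload (the path is illustrative):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils

val sc = new SparkContext("local", "libsvmExample")
// The -1 sentinel means the number of features is inferred from the
// data, and the default minimum partition count is used.
val points = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
println(points.first().label)
```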
@@ -181,12 +193,14 @@ object MLUtils {
* @param path file or directory path in any Hadoop-supported file system URI
* @param minPartitions min number of partitions
* @return vectors stored as an RDD[Vector]
* @since 1.1.0
*/
def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] =
sc.textFile(path, minPartitions).map(Vectors.parse)

/**
* Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions.
* @since 1.1.0
*/
def loadVectors(sc: SparkContext, path: String): RDD[Vector] =
sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse)
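
For context, a round-trip sketch, under the assumption that `saveAsTextFile` writes each vector's `toString`, which `Vectors.parse` reads back (paths illustrative):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils

val sc = new SparkContext("local", "vectorsExample")
// Writes one text line per vector, e.g. "[1.0,2.0]".
sc.parallelize(Seq(Vectors.dense(1.0, 2.0))).saveAsTextFile("/tmp/vecs")
// Parses each line back into a Vector.
val vectors = MLUtils.loadVectors(sc, "/tmp/vecs")
```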
@@ -197,13 +211,15 @@ object MLUtils {
* @param path file or directory path in any Hadoop-supported file system URI
* @param minPartitions min number of partitions
* @return labeled points stored as an RDD[LabeledPoint]
* @since 1.1.0
*/
def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] =
sc.textFile(path, minPartitions).map(LabeledPoint.parse)

/**
* Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of
* partitions.
* @since 1.1.0
*/
def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] =
loadLabeledPoints(sc, dir, sc.defaultMinPartitions)
@@ -220,6 +236,7 @@ object MLUtils {
*
* @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and
* [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading.
* @since 1.0.0
*/
@deprecated("Should use MLUtils.loadLabeledPoints instead.", "1.0.1")
def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
@@ -241,6 +258,7 @@ object MLUtils {
*
* @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and
* [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading.
* @since 1.0.0
*/
@deprecated("Should use RDD[LabeledPoint].saveAsTextFile instead.", "1.0.1")
def saveLabeledData(data: RDD[LabeledPoint], dir: String) {
@@ -253,6 +271,7 @@ object MLUtils {
* Return a k element array of pairs of RDDs with the first element of each pair
* containing the training data (the complement of the validation data) and the second
* element containing the validation data, a unique 1/kth of the data, where k = numFolds.
* @since 1.0.0
*/
@Experimental
def kFold[T: ClassTag](rdd: RDD[T], numFolds: Int, seed: Int): Array[(RDD[T], RDD[T])] = {
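
For context, a minimal usage sketch of kFold (the data and fold count are illustrative):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils

val sc = new SparkContext("local", "kFoldExample")
val data = sc.parallelize(1 to 100)

// Three (training, validation) pairs; each validation slice holds a
// unique ~1/3 of the data, and training holds the complement.
val folds = MLUtils.kFold(data, numFolds = 3, seed = 42)
folds.foreach { case (training, validation) =>
  println(s"train=${training.count()} validation=${validation.count()}")
}
```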
@@ -268,6 +287,7 @@

/**
* Returns a new vector with `1.0` (bias) appended to the input vector.
* @since 1.0.0
*/
def appendBias(vector: Vector): Vector = {
vector match {
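
For context, a minimal usage sketch of appendBias:

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils

val v = Vectors.dense(1.0, 2.0)
// Appends a trailing 1.0 bias term, yielding [1.0, 2.0, 1.0].
val withBias = MLUtils.appendBias(v)
```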