@@ -1133,6 +1133,7 @@ val scaledData = scalerModel.transform(dataFrame)
11331133{% highlight java %}
11341134import org.apache.spark.api.java.JavaRDD;
11351135import org.apache.spark.ml.feature.StandardScaler;
1136+ import org.apache.spark.ml.feature.StandardScalerModel;
11361137import org.apache.spark.mllib.regression.LabeledPoint;
11371138import org.apache.spark.mllib.util.MLUtils;
11381139import org.apache.spark.sql.DataFrame;
@@ -1173,6 +1174,76 @@ scaledData = scalerModel.transform(dataFrame)
11731174</div>
11741175</div>
11751176
1177+ ## MinMaxScaler
1178+
1179+ `MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature to a specific range (often [0, 1]). It takes parameters:
1180+
1181+ * `min`: 0.0 by default. Lower bound after transformation, shared by all features.
1182+ * `max`: 1.0 by default. Upper bound after transformation, shared by all features.
1183+
1184+ `MinMaxScaler` computes summary statistics on a data set and produces a `MinMaxScalerModel`. The model can then transform each feature individually such that it is in the given range.
1185+
1186+ The rescaled value for a feature E is calculated as,
1187+ `\begin{equation}
1188+ Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
1189+ \end{equation}`
1190+ For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`
1191+
1192+ Note that since zero values will probably be transformed to non-zero values, output of the transformer will be `DenseVector` even for sparse input.
1193+
1194+ The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1].
1195+
1196+ <div class="codetabs">
1197+ <div data-lang="scala" markdown="1">
1198+ More details can be found in the API docs for
1199+ [MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and
1200+ [MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
1201+ {% highlight scala %}
1202+ import org.apache.spark.ml.feature.MinMaxScaler
1203+ import org.apache.spark.mllib.util.MLUtils
1204+
1205+ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
1206+ val dataFrame = sqlContext.createDataFrame(data)
1207+ val scaler = new MinMaxScaler()
1208+ .setInputCol("features")
1209+ .setOutputCol("scaledFeatures")
1210+
1211+ // Compute summary statistics and generate MinMaxScalerModel
1212+ val scalerModel = scaler.fit(dataFrame)
1213+
1214+ // rescale each feature to range [min, max].
1215+ val scaledData = scalerModel.transform(dataFrame)
1216+ {% endhighlight %}
1217+ </div>
1218+
1219+ <div data-lang="java" markdown="1">
1220+ More details can be found in the API docs for
1221+ [MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and
1222+ [MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html).
1223+ {% highlight java %}
1224+ import org.apache.spark.api.java.JavaRDD;
1225+ import org.apache.spark.ml.feature.MinMaxScaler;
1226+ import org.apache.spark.ml.feature.MinMaxScalerModel;
1227+ import org.apache.spark.mllib.regression.LabeledPoint;
1228+ import org.apache.spark.mllib.util.MLUtils;
1229+ import org.apache.spark.sql.DataFrame;
1230+
1231+ JavaRDD<LabeledPoint> data =
1232+ MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
1233+ DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
1234+ MinMaxScaler scaler = new MinMaxScaler()
1235+ .setInputCol("features")
1236+ .setOutputCol("scaledFeatures");
1237+
1238+ // Compute summary statistics and generate MinMaxScalerModel
1239+ MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
1240+
1241+ // rescale each feature to range [min, max].
1242+ DataFrame scaledData = scalerModel.transform(dataFrame);
1243+ {% endhighlight %}
1244+ </div>
1245+ </div>
1246+
11761247## Bucketizer
11771248
11781249` Bucketizer ` transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter:
0 commit comments