From 7a0829f99fa9f7f261362a182caecad171d5ab78 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Thu, 16 Feb 2017 14:38:30 -0800 Subject: [PATCH 1/3] linearsvc doc and example --- docs/ml-classification-regression.md | 45 ++++++++++++++++ .../examples/ml/JavaLinearSVCExample.java | 54 +++++++++++++++++++ examples/src/main/python/ml/linearsvc.py | 46 ++++++++++++++++ examples/src/main/r/ml/linearSVC.R | 41 ++++++++++++++ .../spark/examples/ml/LinearSVCExample.scala | 52 ++++++++++++++++++ 5 files changed, 238 insertions(+) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java create mode 100644 examples/src/main/python/ml/linearsvc.py create mode 100644 examples/src/main/r/ml/linearSVC.R create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/LinearSVCExample.scala diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index 782ee5818893..ed8c59d6e8f1 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -363,6 +363,51 @@ Refer to the [R API docs](api/R/spark.mlp.html) for more details. +## Linear Support Vector Machine + +A [support vector machine](https://en.wikipedia.org/wiki/Support_vector_machine) constructs a hyperplane +or set of hyperplanes in a high- or infinite-dimensional space, which can be used for classification, +regression, or other tasks. Intuitively, a good separation is achieved by the hyperplane that has +the largest distance to the nearest training-data point of any class (so-called functional margin), +since in general the larger the margin the lower the generalization error of the classifier. LinearSVC +in Spark ML supports binary calssification with linear SVM. Internally, it optimizes the +[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) using OWLQN optimizer. + + +**Examples** + +
+ +<div data-lang="scala" markdown="1">
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.LinearSVC) for more details. + +{% include_example scala/org/apache/spark/examples/ml/LinearSVCExample.scala %} +</div>
+ +<div data-lang="java" markdown="1">
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/LinearSVC.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaLinearSVCExample.java %} +</div>
+ +<div data-lang="python" markdown="1">
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.LinearSVC) for more details. + +{% include_example python/ml/linearsvc.py %} +</div>
+ +<div data-lang="r" markdown="1">
+ +Refer to the [R API docs](api/R/spark.linearSVC.html) for more details. + +{% include_example r/ml/linearSVC.R %} +</div>
+ +
+ ## One-vs-Rest classifier (a.k.a. One-vs-All) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java new file mode 100644 index 000000000000..ef85a1c5389b --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +// $example on$ +import org.apache.spark.ml.classification.LinearSVC; +import org.apache.spark.ml.classification.LinearSVCModel; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +// $example off$ + +public class JavaLinearSVCExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaLinearSVCExample") + .getOrCreate(); + + // $example on$ + // Load training data + Dataset<Row> training = spark.read().format("libsvm") + .load("data/mllib/sample_libsvm_data.txt"); + + LinearSVC lsvc = new LinearSVC() + .setMaxIter(10) + .setRegParam(0.3); + + // Fit the model + LinearSVCModel lsvcModel = lsvc.fit(training); + + // Print the coefficients and intercept for LinearSVC + System.out.println("Coefficients: " + + lsvcModel.coefficients() + " Intercept: " + lsvcModel.intercept()); + // $example off$ + + spark.stop(); + } +} diff --git a/examples/src/main/python/ml/linearsvc.py b/examples/src/main/python/ml/linearsvc.py new file mode 100644 index 000000000000..18cbf87a1069 --- /dev/null +++ b/examples/src/main/python/ml/linearsvc.py @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +# $example on$ +from pyspark.ml.classification import LinearSVC +# $example off$ +from pyspark.sql import SparkSession + +if __name__ == "__main__": + spark = SparkSession\ + .builder\ + .appName("linearSVC Example")\ + .getOrCreate() + + # $example on$ + # Load training data + training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + + lsvc = LinearSVC(maxIter=10, regParam=0.1) + + # Fit the model + lsvcModel = lsvc.fit(training) + + # Print the coefficients and intercept for LinearSVC + print("Coefficients: " + str(lsvcModel.coefficients)) + print("Intercept: " + str(lsvcModel.intercept)) + + # $example off$ + + spark.stop() diff --git a/examples/src/main/r/ml/linearSVC.R b/examples/src/main/r/ml/linearSVC.R new file mode 100644 index 000000000000..c1435a296dd4 --- /dev/null +++ b/examples/src/main/r/ml/linearSVC.R @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/linearSVC.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-linearSVC-example") + +# Linear Support Vector Machine Classifier + +# $example on$ +# Load training data +df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit an binomial linearSVC model with spark.linearSVC +model <- spark.linearSVC(training, label ~ features, maxIter = 10, regParam = 0.1) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearSVCExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearSVCExample.scala new file mode 100644 index 000000000000..5f43e65712b5 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearSVCExample.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.classification.LinearSVC +// $example off$ +import org.apache.spark.sql.SparkSession + +object LinearSVCExample { + + def main(args: Array[String]): Unit = { + val spark = SparkSession + .builder + .appName("LinearSVCExample") + .getOrCreate() + + // $example on$ + // Load training data + val training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + + val lsvc = new LinearSVC() + .setMaxIter(10) + .setRegParam(0.1) + + // Fit the model + val lsvcModel = lsvc.fit(training) + + // Print the coefficients and intercept for linear svc + println(s"Coefficients: ${lsvcModel.coefficients} Intercept: ${lsvcModel.intercept}") + // $example off$ + + spark.stop() + } +} +// scalastyle:on println From b888f35372532e0766839068e0827454afed10aa Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 17 Feb 2017 12:23:57 -0800 Subject: [PATCH 2/3] remove r example and some spell correction --- docs/ml-classification-regression.md | 11 +---- .../examples/ml/JavaLinearSVCExample.java | 2 +- examples/src/main/r/ml/linearSVC.R | 41 ------------------- 3 files changed, 3 insertions(+), 51 deletions(-) delete mode 100644 examples/src/main/r/ml/linearSVC.R diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index ed8c59d6e8f1..bc612910722e 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -368,9 +368,9 @@ Refer to the [R API docs](api/R/spark.mlp.html) for more details. A [support vector machine](https://en.wikipedia.org/wiki/Support_vector_machine) constructs a hyperplane or set of hyperplanes in a high- or infinite-dimensional space, which can be used for classification, regression, or other tasks. 
Intuitively, a good separation is achieved by the hyperplane that has -the largest distance to the nearest training-data point of any class (so-called functional margin), +the largest distance to the nearest training-data points of any class (so-called functional margin), since in general the larger the margin the lower the generalization error of the classifier. LinearSVC -in Spark ML supports binary calssification with linear SVM. Internally, it optimizes the +in Spark ML supports binomial classification with linear SVM. Internally, it optimizes the [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) using OWLQN optimizer. @@ -399,13 +399,6 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat {% include_example python/ml/linearsvc.py %} -
- -Refer to the [R API docs](api/R/spark.linearSVC.html) for more details. - -{% include_example r/ml/linearSVC.R %} -
- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java index ef85a1c5389b..a18ed1d0b48f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java @@ -39,7 +39,7 @@ public static void main(String[] args) { LinearSVC lsvc = new LinearSVC() .setMaxIter(10) - .setRegParam(0.3); + .setRegParam(0.1); // Fit the model LinearSVCModel lsvcModel = lsvc.fit(training); diff --git a/examples/src/main/r/ml/linearSVC.R b/examples/src/main/r/ml/linearSVC.R deleted file mode 100644 index c1435a296dd4..000000000000 --- a/examples/src/main/r/ml/linearSVC.R +++ /dev/null @@ -1,41 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# To run this example use -# ./bin/spark-submit examples/src/main/r/ml/linearSVC.R - -# Load SparkR library into your R session -library(SparkR) - -# Initialize SparkSession -sparkR.session(appName = "SparkR-ML-linearSVC-example") - -# Linear Support Vector Machine Classifier - -# $example on$ -# Load training data -df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm") -training <- df -test <- df - -# Fit an binomial linearSVC model with spark.linearSVC -model <- spark.linearSVC(training, label ~ features, maxIter = 10, regParam = 0.1) - -# Prediction -predictions <- predict(model, test) -showDF(predictions) -# $example off$ From 165fbe430e691124a87dde9862df7b3ba3e3d4a2 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sun, 19 Feb 2017 11:19:23 -0800 Subject: [PATCH 3/3] change to binary and add r --- docs/ml-classification-regression.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index bc612910722e..37862f82c338 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -370,7 +370,7 @@ or set of hyperplanes in a high- or infinite-dimensional space, which can be use regression, or other tasks. Intuitively, a good separation is achieved by the hyperplane that has the largest distance to the nearest training-data points of any class (so-called functional margin), since in general the larger the margin the lower the generalization error of the classifier. LinearSVC -in Spark ML supports binomial classification with linear SVM. Internally, it optimizes the +in Spark ML supports binary classification with linear SVM. Internally, it optimizes the [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) using OWLQN optimizer. @@ -399,8 +399,14 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat {% include_example python/ml/linearsvc.py %} +
+ +Refer to the [R API docs](api/R/spark.svmLinear.html) for more details. + +{% include_example r/ml/svmLinear.R %}
+ ## One-vs-Rest classifier (a.k.a. One-vs-All)