Commit c8b16ca

jkbradley authored and mengxr committed
[SPARK-2850] [SPARK-2626] [mllib] MLlib stats examples + small fixes
Added examples for statistical summarization:
* Scala: StatisticalSummary.scala
  ** Tests: correlation, MultivariateOnlineSummarizer
* python: statistical_summary.py
  ** Tests: correlation (since MultivariateOnlineSummarizer has no Python API)

Added examples for random and sampled RDDs:
* Scala: RandomAndSampledRDDs.scala
* python: random_and_sampled_rdds.py
* Both test:
  ** RandomRDDGenerators.normalRDD, normalVectorRDD
  ** RDD.sample, takeSample, sampleByKey

Added sc.stop() to all examples.

CorrelationSuite.scala
* Added 1 test for RDDs with only 1 value

RowMatrix.scala
* numCols(): Added check for numRows = 0, with error message.
* computeCovariance(): Added check for numRows <= 1, with error message.

Python SparseVector (pyspark/mllib/linalg.py)
* Added toDense() function

python/run-tests script
* Added stat.py (doc test)

CC: mengxr dorx

Main changes were examples to show usage across APIs.

Author: Joseph K. Bradley <[email protected]>

Closes apache#1878 from jkbradley/mllib-stats-api-check and squashes the following commits:

ea5c047 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
dafebe2 [Joseph K. Bradley] Bug fixes for examples SampledRDDs.scala and sampled_rdds.py: Check for division by 0 and for missing key in maps.
8d1e555 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
60c72d9 [Joseph K. Bradley] Fixed stat.py doc test to work for Python versions printing nan or NaN.
b20d90a [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
4e5d15e [Joseph K. Bradley] Changed pyspark/mllib/stat.py doc tests to use NaN instead of nan.
32173b7 [Joseph K. Bradley] Stats examples update.
c8c20dc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
cf70b07 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
0b7cec3 [Joseph K. Bradley] Small updates based on code review. Renamed statistical_summary.py to correlations.py
ab48f6e [Joseph K. Bradley] RowMatrix.scala: numCols(): Added check for numRows = 0, with error message. computeCovariance(): Added check for numRows <= 1, with error message.
65e4ebc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
8195c78 [Joseph K. Bradley] Added examples for random and sampled RDDs: Scala: RandomAndSampledRDDs.scala; python: random_and_sampled_rdds.py. Both test: RandomRDDGenerators.normalRDD, normalVectorRDD; RDD.sample, takeSample, sampleByKey
064985b [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
ee918e9 [Joseph K. Bradley] Added examples for statistical summarization: Scala: StatisticalSummary.scala (tests: correlation, MultivariateOnlineSummarizer); python: statistical_summary.py (tests: correlation, since MultivariateOnlineSummarizer has no Python API)
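The commit message mentions a new toDense() helper on the Python SparseVector, but the pyspark/mllib/linalg.py diff is not shown in this capture. A minimal stand-in sketch of what densification does, assuming the usual (size, indices, values) sparse representation — the function name and signature here are illustrative, not the actual MLlib implementation:

```python
def to_dense(size, indices, values):
    # Expand a sparse (size, indices, values) triple into a dense list:
    # each position listed in `indices` receives the matching entry from
    # `values`; every other position stays 0.0.
    dense = [0.0] * size
    for idx, val in zip(indices, values):
        dense[idx] = val
    return dense

print(to_dense(4, [1, 3], [2.0, 5.0]))  # [0.0, 2.0, 0.0, 5.0]
```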
1 parent 115eeb3 commit c8b16ca

29 files changed: +664, −20 lines

examples/src/main/python/als.py

Lines changed: 2 additions & 0 deletions

@@ -97,3 +97,5 @@ def update(i, vec, mat, ratings):
 error = rmse(R, ms, us)
 print "Iteration %d:" % i
 print "\nRMSE: %5.4f\n" % error
+
+sc.stop()

examples/src/main/python/cassandra_inputformat.py

Lines changed: 2 additions & 0 deletions

@@ -77,3 +77,5 @@
 output = cass_rdd.collect()
 for (k, v) in output:
     print (k, v)
+
+sc.stop()

examples/src/main/python/cassandra_outputformat.py

Lines changed: 2 additions & 0 deletions

@@ -81,3 +81,5 @@
 conf=conf,
 keyConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLKeyConverter",
 valueConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLValueConverter")
+
+sc.stop()

examples/src/main/python/hbase_inputformat.py

Lines changed: 2 additions & 0 deletions

@@ -71,3 +71,5 @@
 output = hbase_rdd.collect()
 for (k, v) in output:
     print (k, v)
+
+sc.stop()

examples/src/main/python/hbase_outputformat.py

Lines changed: 2 additions & 0 deletions

@@ -63,3 +63,5 @@
 conf=conf,
 keyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
 valueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter")
+
+sc.stop()

examples/src/main/python/kmeans.py

Lines changed: 2 additions & 0 deletions

@@ -77,3 +77,5 @@ def closestPoint(p, centers):
 kPoints[x] = y
 
 print "Final centers: " + str(kPoints)
+
+sc.stop()

examples/src/main/python/logistic_regression.py

Lines changed: 2 additions & 0 deletions

@@ -80,3 +80,5 @@ def add(x, y):
 w -= points.map(lambda m: gradient(m, w)).reduce(add)
 
 print "Final w: " + str(w)
+
+sc.stop()
Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Correlations using MLlib.
+"""
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat import Statistics
+from pyspark.mllib.util import MLUtils
+
+
+if __name__ == "__main__":
+    if len(sys.argv) not in [1,2]:
+        print >> sys.stderr, "Usage: correlations (<file>)"
+        exit(-1)
+    sc = SparkContext(appName="PythonCorrelations")
+    if len(sys.argv) == 2:
+        filepath = sys.argv[1]
+    else:
+        filepath = 'data/mllib/sample_linear_regression_data.txt'
+    corrType = 'pearson'
+
+    points = MLUtils.loadLibSVMFile(sc, filepath)\
+        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
+
+    print
+    print 'Summary of data file: ' + filepath
+    print '%d data points' % points.count()
+
+    # Statistics (correlations)
+    print
+    print 'Correlation (%s) between label and each feature' % corrType
+    print 'Feature\tCorrelation'
+    numFeatures = points.take(1)[0].features.size
+    labelRDD = points.map(lambda lp: lp.label)
+    for i in range(numFeatures):
+        featureRDD = points.map(lambda lp: lp.features[i])
+        corr = Statistics.corr(labelRDD, featureRDD, corrType)
+        print '%d\t%g' % (i, corr)
+    print
+
+    sc.stop()
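For reference, a plain-Python sketch of the Pearson coefficient that Statistics.corr computes for each (label, feature) pair in the correlations example above — illustrative only; the real computation runs distributed over the RDDs:

```python
import math

def pearson(xs, ys):
    # Pearson correlation: covariance of the two samples divided by
    # the product of their standard deviations.
    n = len(xs)
    mean_x = sum(xs) / float(n)
    mean_y = sum(ys) / float(n)
    cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
    var_x = sum((x - mean_x) ** 2 for x in xs)
    var_y = sum((y - mean_y) ** 2 for y in ys)
    return cov / math.sqrt(var_x * var_y)

# A perfectly linear relationship gives correlation 1.0.
print(pearson([1.0, 2.0, 3.0, 4.0], [2.0, 4.0, 6.0, 8.0]))  # 1.0
```

This also makes clear why correlation is undefined for an RDD with a single value (the variance terms are zero), the edge case covered by the new CorrelationSuite.scala test.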

examples/src/main/python/mllib/decision_tree_runner.py

Lines changed: 5 additions & 0 deletions

@@ -17,6 +17,8 @@
 
 """
 Decision tree classification and regression using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
 """
 
 import numpy, os, sys
@@ -117,6 +119,7 @@ def usage():
 if len(sys.argv) == 2:
     dataPath = sys.argv[1]
 if not os.path.isfile(dataPath):
+    sc.stop()
     usage()
 points = MLUtils.loadLibSVMFile(sc, dataPath)
@@ -133,3 +136,5 @@ def usage():
 print "  Model depth: %d\n" % model.depth()
 print "  Training accuracy: %g\n" % getAccuracy(model, reindexedData)
 print model
+
+sc.stop()

examples/src/main/python/mllib/kmeans.py

Lines changed: 1 addition & 0 deletions

@@ -42,3 +42,4 @@ def parseVector(line):
 k = int(sys.argv[2])
 model = KMeans.train(data, k)
 print "Final centers: " + str(model.clusterCenters)
+sc.stop()
