Skip to content

Commit f779561

Browse files
committed
[SPARK-7328] Pyspark.mllib.linalg.Vectors: Missing items
1 parent 32cdc81 commit f779561

File tree

2 files changed

+194
-8
lines changed

2 files changed

+194
-8
lines changed

python/pyspark/mllib/linalg.py

Lines changed: 168 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
import sys
2727
import array
28+
from math import sqrt
2829

2930
if sys.version >= '3':
3031
basestring = str
@@ -208,9 +209,55 @@ def __init__(self, ar):
208209
ar = ar.astype(np.float64)
209210
self.array = ar
210211

212+
def toString(self):
    """
    Return the string form of this DenseVector.

    This is a thin wrapper around ``str()``.

    >>> a = DenseVector([0, 1, 2, 3])
    >>> a.toString()
    '[0.0,1.0,2.0,3.0]'
    """
    text = str(self)
    return text
221+
222+
def copy(self):
    """Return a new DenseVector built from a copy of the underlying array."""
    duplicated = np.copy(self.array)
    return DenseVector(duplicated)
224+
225+
@staticmethod
def parse(vectorString):
    """
    Parse string representation back into the DenseVector.

    The first and last characters (the enclosing brackets) are dropped and
    the remainder is split on commas. An empty body such as '[]' yields an
    empty vector instead of failing on ``float('')``.

    >>> DenseVector.parse('[0.0,1.0,2.0,3.0]')
    DenseVector([0.0, 1.0, 2.0, 3.0])

    :param vectorString: text of the form '[v0,v1,...]'.
    :return: a DenseVector holding the parsed values.
    :raises ValueError: if an entry cannot be converted to float.
    """
    body = vectorString[1:-1]
    if not body.strip():
        # ''.split(',') gives [''], which float() rejects; an empty vector
        # has to be handled explicitly.
        return DenseVector([])
    return DenseVector([float(val) for val in body.split(',')])
235+
211236
def __reduce__(self):
    """
    Pickle support: rebuild through DenseVector from the array's raw bytes.
    """
    arr = self.array
    # ndarray.tostring() is a deprecated alias of tobytes() and is missing
    # from recent NumPy releases; only fall back to it when tobytes() is
    # not available.
    serialize = getattr(arr, "tobytes", None) or arr.tostring
    return DenseVector, (serialize(),)
213238

239+
def numNonzeros(self):
    """Return how many entries of the underlying array are nonzero."""
    nonzero_positions = np.nonzero(self.array)[0]
    return nonzero_positions.size
241+
242+
def norm(self, p):
    """
    Calculate the p-norm of a DenseVector.

    >>> a = DenseVector([0, -1, 2, -3])
    >>> a.norm(2)
    3.7...
    >>> a.norm(1)
    6.0

    :param p: order of the norm. 1, 2 and ``np.inf`` have dedicated
              branches; any other value uses the general formula
              ``sum(|x_i| ** p) ** (1 / p)``.
    :return: the norm as a scalar.
    """
    magnitudes = np.abs(self.array)
    if p == 1:
        return np.sum(magnitudes)
    elif p == 2:
        return sqrt(np.dot(self.array, self.array))
    elif p == np.inf:
        # np.max raises on an empty array; an empty vector has norm 0.
        return np.max(magnitudes) if magnitudes.size else 0.0
    else:
        # The earlier expression skipped both the absolute value and the
        # summation, so it produced an element-wise array, not a scalar.
        return np.power(np.sum(np.power(magnitudes, p)), 1.0 / p)
260+
214261
def dot(self, other):
215262
"""
216263
Compute the dot product of two Vectors. We support
@@ -387,9 +434,60 @@ def __init__(self, size, *args):
387434
if self.indices[i] >= self.indices[i + 1]:
388435
raise TypeError("indices array must be sorted")
389436

437+
def copy(self):
    """Return a new SparseVector of the same size with copied index and value arrays."""
    size = self.size
    index_copy = np.copy(self.indices)
    value_copy = np.copy(self.values)
    return SparseVector(size, index_copy, value_copy)
439+
440+
def numNonzeros(self):
    """Return how many of the stored values are nonzero."""
    nonzero_positions = np.nonzero(self.values)[0]
    return nonzero_positions.size
442+
443+
def norm(self, p):
    """
    Calculate the p-norm of a SparseVector.

    Only the stored values are used.

    >>> a = SparseVector(4, [0, 1], [3., -4.])
    >>> a.norm(1)
    7.0
    >>> a.norm(2)
    5.0

    :param p: order of the norm. 1, 2 and ``np.inf`` have dedicated
              branches; any other value uses the general formula
              ``sum(|x_i| ** p) ** (1 / p)``.
    :return: the norm as a scalar.
    """
    magnitudes = np.abs(self.values)
    if p == 1:
        return np.sum(magnitudes)
    elif p == 2:
        return sqrt(np.dot(self.values, self.values))
    elif p == np.inf:
        # np.max raises when there are no stored values; such a vector
        # has norm 0.
        return np.max(magnitudes) if magnitudes.size else 0.0
    else:
        # The earlier expression skipped both the absolute value and the
        # summation, so it produced an element-wise array, not a scalar.
        return np.power(np.sum(np.power(magnitudes, p)), 1.0 / p)
461+
390462
def __reduce__(self):
    """
    Pickle support: rebuild through SparseVector from the size and the raw
    bytes of the index and value arrays.
    """
    def raw_bytes(arr):
        # ndarray.tostring() is a deprecated alias of tobytes() and is
        # missing from recent NumPy releases; only fall back to it when
        # tobytes() is not available.
        serialize = getattr(arr, "tobytes", None) or arr.tostring
        return serialize()

    return (SparseVector, (self.size, raw_bytes(self.indices), raw_bytes(self.values)))
392464

465+
def toString(self):
    """
    Return the string form of this SparseVector.

    This is a thin wrapper around ``str()``.

    >>> a = SparseVector(4, [0, 1], [4, 5])
    >>> a.toString()
    '(4,[0,1],[4.0,5.0])'
    """
    text = str(self)
    return text
474+
475+
@staticmethod
def parse(vectorString):
    """
    Parse string representation back into the SparseVector.

    The expected layout is '(size,[i0,i1,...],[v0,v1,...])'. The size may
    have any number of digits and either list may be empty.

    >>> SparseVector.parse('(4,[0,1],[4.0,5.0])')
    SparseVector(4, {0: 4.0, 1: 5.0})
    >>> SparseVector.parse('(12,[0,10],[4.0,5.0])')
    SparseVector(12, {0: 4.0, 10: 5.0})

    :param vectorString: text in the layout described above.
    :return: a SparseVector built from the parsed size, indices and values.
    :raises ValueError: if the layout is not recognised or a number cannot
                        be converted.
    """
    text = vectorString.strip()

    # The size is everything between '(' and the first comma, so it is not
    # limited to a single digit.
    first_comma = text.find(',')
    if not text.startswith('(') or first_comma == -1:
        raise ValueError("Unable to parse sparse vector from %r" % (vectorString,))
    size = int(text[1:first_comma])

    # Locate both bracketed lists by their delimiters rather than by fixed
    # offsets, which only held for a one-character size.
    ind_start = text.find('[', first_comma)
    ind_end = text.find(']', ind_start)
    val_start = text.find('[', ind_end)
    val_end = text.find(']', val_start)
    if -1 in (ind_start, ind_end, val_start, val_end):
        raise ValueError("Unable to parse sparse vector from %r" % (vectorString,))

    index_string = text[ind_start + 1:ind_end]
    value_string = text[val_start + 1:val_end]
    # ''.split(',') gives [''], so empty lists are handled explicitly.
    indices = [int(ind) for ind in index_string.split(',')] if index_string.strip() else []
    values = [float(val) for val in value_string.split(',')] if value_string.strip() else []
    return SparseVector(size, indices, values)
490+
393491
def dot(self, other):
394492
"""
395493
Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -430,12 +528,15 @@ def dot(self, other):
430528

431529
assert len(self) == _vector_size(other), "dimension mismatch"
432530

433-
if type(other) in (np.ndarray, array.array, DenseVector):
531+
if type(other) in (np.ndarray, array.array):
434532
result = 0.0
435-
for i in xrange(len(self.indices)):
436-
result += self.values[i] * other[self.indices[i]]
533+
for i, ind in enumerate(self.indices):
534+
result += self.values[i] * other[ind]
437535
return result
438536

537+
elif isinstance(other, DenseVector):
538+
return np.dot(other.toArray()[self.indices], self.values)
539+
439540
elif type(other) is SparseVector:
440541
result = 0.0
441542
i, j = 0, 0
@@ -479,19 +580,28 @@ def squared_distance(self, other):
479580
AssertionError: dimension mismatch
480581
"""
481582
assert len(self) == _vector_size(other), "dimension mismatch"
482-
if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
583+
if type(other) in (list, array.array, np.array, np.ndarray):
483584
if type(other) is np.array and other.ndim != 1:
484585
raise Exception("Cannot call squared_distance with %d-dimensional array" %
485586
other.ndim)
486587
result = 0.0
487588
j = 0 # index into our own array
488-
for i in xrange(len(other)):
589+
for i, other_ind in enumerate(other):
489590
if j < len(self.indices) and self.indices[j] == i:
490-
diff = self.values[j] - other[i]
591+
diff = self.values[j] - other_ind
491592
result += diff * diff
492593
j += 1
493594
else:
494-
result += other[i] * other[i]
595+
result += other_ind * other_ind
596+
return result
597+
598+
elif isinstance(other, DenseVector):
599+
bool_ind = np.zeros(len(other), dtype=bool)
600+
bool_ind[self.indices] = True
601+
dist = other.toArray()[bool_ind] - self.values
602+
result = np.dot(dist, dist)
603+
other_values = other.toArray()[~bool_ind]
604+
result += np.dot(other_values, other_values)
495605
return result
496606

497607
elif type(other) is SparseVector:
@@ -633,6 +743,57 @@ def stringify(vector):
633743
"""
634744
return str(vector)
635745

746+
@staticmethod
def dot(a, b):
    """
    Dot product between two vectors.

    Both arguments are passed through ``_convert_to_vector`` first; they
    may be SparseVector, DenseVector, np.ndarray or array.array.

    >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
    >>> b = Vectors.dense([23, 41, 9, 1])
    >>> Vectors.dot(a, b)
    27.0
    >>> Vectors.dot(a, a)
    17.0
    >>> Vectors.dot(a, np.array([0, 1, 2, 4]))
    16.0
    """
    left = _convert_to_vector(a)
    right = _convert_to_vector(b)
    return left.dot(right)
764+
765+
@staticmethod
def squared_distance(a, b):
    """
    Squared distance between two vectors.

    Both arguments are passed through ``_convert_to_vector`` first; they
    may be SparseVector, DenseVector, np.ndarray or array.array.

    >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
    >>> b = Vectors.dense([2, 5, 4, 1])
    >>> Vectors.squared_distance(a, b)
    51.0
    """
    left = _convert_to_vector(a)
    right = _convert_to_vector(b)
    return left.squared_distance(right)
779+
780+
@staticmethod
def norm(vec, p):
    """
    Find norm of the given vector.

    ``vec`` is passed through ``_convert_to_vector`` and the result's
    ``norm(p)`` is returned.
    """
    converted = _convert_to_vector(vec)
    return converted.norm(p)
786+
787+
@staticmethod
def parse(vectorString):
    """
    Parse a vector from its string form.

    A leading '[' selects ``DenseVector.parse``; anything else is handed
    to ``SparseVector.parse``.
    """
    is_dense = vectorString[0] == '['
    parser = DenseVector.parse if is_dense else SparseVector.parse
    return parser(vectorString)
792+
793+
@staticmethod
def zeros(num):
    """Return a DenseVector wrapping ``np.zeros(num)``."""
    blank = np.zeros(num)
    return DenseVector(blank)
796+
636797

637798
class Matrix(object):
638799
"""

python/pyspark/mllib/tests.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import tempfile
2525
import array as pyarray
2626

27-
from numpy import array, array_equal, zeros
27+
from numpy import array, array_equal, zeros, inf
2828
from py4j.protocol import Py4JJavaError
2929

3030
if sys.version_info[:2] <= (2, 6):
@@ -110,6 +110,10 @@ def test_dot(self):
110110
self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
111111
self.assertEquals(30.0, lst.dot(dv))
112112
self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
113+
self.assertEquals(Vectors.dot(sv, sv), 5.)
114+
self.assertEquals(Vectors.dot(sv, dv), 10.)
115+
self.assertEquals(Vectors.dot(dv, sv), 10.)
116+
self.assertEquals(Vectors.dot(sv, array([2, 5, 7, 8])), 21.0)
113117

114118
def test_squared_distance(self):
115119
sv = SparseVector(4, {1: 1, 3: 2})
@@ -220,6 +224,27 @@ def test_dense_matrix_is_transposed(self):
220224
self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
221225
self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))
222226

227+
def test_parse_matrix(self):
    """Vectors should survive a toString() / Vectors.parse() round trip."""
    # assertTrue(x, y) treats y as the failure message and passes for any
    # truthy x, so these checks have to use assertEqual to compare values.
    # NOTE(review): the round-trip checks rely on the vector classes
    # defining value equality -- confirm against linalg.py.
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(a.toString(), '[3.0,4.0,6.0,7.0]')
    self.assertEqual(Vectors.parse(a.toString()), a)
    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(a.toString(), '(4,[0,2],[3.0,4.0])')
    self.assertEqual(Vectors.parse(a.toString()), a)
234+
235+
def test_norms(self):
    """Check the 1-, 2- and inf-norms of dense and sparse vectors, plus numNonzeros."""
    # assertTrue(x, y) treats y as the failure message and passes for any
    # truthy x, so the exact-value checks use assertEqual instead.
    a = DenseVector([0, 2, 3, -1])
    self.assertAlmostEqual(a.norm(2), 3.742, 3)
    self.assertEqual(a.norm(1), 6)
    self.assertEqual(a.norm(inf), 3)
    a = SparseVector(4, [0, 2], [3, -4])
    self.assertAlmostEqual(a.norm(2), 5)
    self.assertEqual(a.norm(1), 7)
    self.assertEqual(a.norm(inf), 4)

    # An explicitly stored zero must not be counted as a nonzero entry.
    tmp = SparseVector(4, [0, 2], [3, 0])
    self.assertEqual(tmp.numNonzeros(), 1)
247+
223248

224249
class ListTests(MLlibTestCase):
225250

0 commit comments

Comments
 (0)