|
25 | 25 |
|
26 | 26 | import sys |
27 | 27 | import array |
| 28 | +from math import sqrt |
28 | 29 |
|
29 | 30 | if sys.version >= '3': |
30 | 31 | basestring = str |
@@ -208,9 +209,55 @@ def __init__(self, ar): |
208 | 209 | ar = ar.astype(np.float64) |
209 | 210 | self.array = ar |
210 | 211 |
|
| 212 | + def toString(self): |
| 213 | + """ |
| 214 | + Convert DenseVector to string representation. |
| 215 | +
|
| 216 | + >>> a = DenseVector([0, 1, 2, 3]) |
| 217 | + >>> a.toString() |
| 218 | + '[0.0,1.0,2.0,3.0]' |
| 219 | + """ |
| 220 | + return str(self) |
| 221 | + |
| 222 | + def copy(self): |
| 223 | + return DenseVector(np.copy(self.array)) |
| 224 | + |
| 225 | + @staticmethod |
| 226 | + def parse(vectorString): |
| 227 | + """ |
| 228 | + Parse string representation back into the DenseVector. |
| 229 | +
|
| 230 | + >>> DenseVector.parse('[0.0,1.0,2.0,3.0]') |
| 231 | + DenseVector([0.0, 1.0, 2.0, 3.0]) |
| 232 | + """ |
| 233 | + vectorString = vectorString[1:-1] |
| 234 | + return DenseVector([float(val) for val in vectorString.split(',')]) |
| 235 | + |
211 | 236 | def __reduce__(self): |
212 | 237 | return DenseVector, (self.array.tostring(),) |
213 | 238 |
|
| 239 | + def numNonzeros(self): |
| 240 | + return np.nonzero(self.array)[0].size |
| 241 | + |
| 242 | + def norm(self, p): |
| 243 | + """ |
| 244 | + Calculte the norm of a DenseVector. |
| 245 | +
|
| 246 | + >>> a = DenseVector([0, -1, 2, -3]) |
| 247 | + >>> a.norm(2) |
| 248 | + 3.7... |
| 249 | + >>> a.norm(1) |
| 250 | + 6.0 |
| 251 | + """ |
| 252 | + if p == 1: |
| 253 | + return np.sum(np.abs(self.array)) |
| 254 | + elif p == 2: |
| 255 | + return sqrt(np.dot(self.array, self.array)) |
| 256 | + elif p == np.inf: |
| 257 | + return np.max(np.abs(self.array)) |
| 258 | + else: |
| 259 | + return pow(np.power(self.array, p), 1.0 / p) |
| 260 | + |
214 | 261 | def dot(self, other): |
215 | 262 | """ |
216 | 263 | Compute the dot product of two Vectors. We support |
@@ -387,9 +434,60 @@ def __init__(self, size, *args): |
387 | 434 | if self.indices[i] >= self.indices[i + 1]: |
388 | 435 | raise TypeError("indices array must be sorted") |
389 | 436 |
|
| 437 | + def copy(self): |
| 438 | + return SparseVector(self.size, np.copy(self.indices), np.copy(self.values)) |
| 439 | + |
| 440 | + def numNonzeros(self): |
| 441 | + return np.nonzero(self.values)[0].size |
| 442 | + |
| 443 | + def norm(self, p): |
| 444 | + """ |
| 445 | + Calculte the norm of a SparseVector. |
| 446 | +
|
| 447 | + >>> a = SparseVector(4, [0, 1], [3., -4.]) |
| 448 | + >>> a.norm(1) |
| 449 | + 7.0 |
| 450 | + >>> a.norm(2) |
| 451 | + 5.0 |
| 452 | + """ |
| 453 | + if p == 1: |
| 454 | + return np.sum(np.abs(self.values)) |
| 455 | + elif p == 2: |
| 456 | + return sqrt(np.dot(self.values, self.values)) |
| 457 | + elif p == np.inf: |
| 458 | + return np.max(np.abs(self.values)) |
| 459 | + else: |
| 460 | + return pow(np.power(self.values, p), 1.0 / p) |
| 461 | + |
390 | 462 | def __reduce__(self): |
391 | 463 | return (SparseVector, (self.size, self.indices.tostring(), self.values.tostring())) |
392 | 464 |
|
| 465 | + def toString(self): |
| 466 | + """ |
| 467 | + Convert SparseVector to string representation. |
| 468 | +
|
| 469 | + >>> a = SparseVector(4, [0, 1], [4, 5]) |
| 470 | + >>> a.toString() |
| 471 | + '(4,[0,1],[4.0,5.0])' |
| 472 | + """ |
| 473 | + return str(self) |
| 474 | + |
| 475 | + @staticmethod |
| 476 | + def parse(vectorString): |
| 477 | + """ |
| 478 | + Parse string representation back into the DenseVector. |
| 479 | +
|
| 480 | + >>> SparseVector.parse('(4,[0,1],[4.0,5.0])') |
| 481 | + SparseVector(4, {0: 4.0, 1: 5.0}) |
| 482 | + """ |
| 483 | + size = int(vectorString[1]) |
| 484 | + ind_end = vectorString.find(']') |
| 485 | + index_string = vectorString[4: ind_end] |
| 486 | + indices = [int(ind) for ind in index_string.split(',')] |
| 487 | + value_string = vectorString[ind_end + 3: -2] |
| 488 | + values = [float(val) for val in value_string.split(',')] |
| 489 | + return SparseVector(size, indices, values) |
| 490 | + |
393 | 491 | def dot(self, other): |
394 | 492 | """ |
395 | 493 | Dot product with a SparseVector or 1- or 2-dimensional Numpy array. |
@@ -430,12 +528,15 @@ def dot(self, other): |
430 | 528 |
|
431 | 529 | assert len(self) == _vector_size(other), "dimension mismatch" |
432 | 530 |
|
433 | | - if type(other) in (np.ndarray, array.array, DenseVector): |
| 531 | + if type(other) in (np.ndarray, array.array): |
434 | 532 | result = 0.0 |
435 | | - for i in xrange(len(self.indices)): |
436 | | - result += self.values[i] * other[self.indices[i]] |
| 533 | + for i, ind in enumerate(self.indices): |
| 534 | + result += self.values[i] * other[ind] |
437 | 535 | return result |
438 | 536 |
|
| 537 | + elif isinstance(other, DenseVector): |
| 538 | + return np.dot(other.toArray()[self.indices], self.values) |
| 539 | + |
439 | 540 | elif type(other) is SparseVector: |
440 | 541 | result = 0.0 |
441 | 542 | i, j = 0, 0 |
@@ -479,19 +580,28 @@ def squared_distance(self, other): |
479 | 580 | AssertionError: dimension mismatch |
480 | 581 | """ |
481 | 582 | assert len(self) == _vector_size(other), "dimension mismatch" |
482 | | - if type(other) in (list, array.array, DenseVector, np.array, np.ndarray): |
| 583 | + if type(other) in (list, array.array, np.array, np.ndarray): |
483 | 584 | if type(other) is np.array and other.ndim != 1: |
484 | 585 | raise Exception("Cannot call squared_distance with %d-dimensional array" % |
485 | 586 | other.ndim) |
486 | 587 | result = 0.0 |
487 | 588 | j = 0 # index into our own array |
488 | | - for i in xrange(len(other)): |
| 589 | + for i, other_ind in enumerate(other): |
489 | 590 | if j < len(self.indices) and self.indices[j] == i: |
490 | | - diff = self.values[j] - other[i] |
| 591 | + diff = self.values[j] - other_ind |
491 | 592 | result += diff * diff |
492 | 593 | j += 1 |
493 | 594 | else: |
494 | | - result += other[i] * other[i] |
| 595 | + result += other_ind * other_ind |
| 596 | + return result |
| 597 | + |
| 598 | + elif isinstance(other, DenseVector): |
| 599 | + bool_ind = np.zeros(len(other), dtype=bool) |
| 600 | + bool_ind[self.indices] = True |
| 601 | + dist = other.toArray()[bool_ind] - self.values |
| 602 | + result = np.dot(dist, dist) |
| 603 | + other_values = other.toArray()[~bool_ind] |
| 604 | + result += np.dot(other_values, other_values) |
495 | 605 | return result |
496 | 606 |
|
497 | 607 | elif type(other) is SparseVector: |
@@ -633,6 +743,57 @@ def stringify(vector): |
633 | 743 | """ |
634 | 744 | return str(vector) |
635 | 745 |
|
| 746 | + @staticmethod |
| 747 | + def dot(a, b): |
| 748 | + """ |
| 749 | + Dot product between two vectors. |
| 750 | + a and b can be of type, SparseVector, DenseVector, np.ndarray |
| 751 | + or array.array. |
| 752 | +
|
| 753 | + >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) |
| 754 | + >>> b = Vectors.dense([23, 41, 9, 1]) |
| 755 | + >>> Vectors.dot(a, b) |
| 756 | + 27.0 |
| 757 | + >>> Vectors.dot(a, a) |
| 758 | + 17.0 |
| 759 | + >>> Vectors.dot(a, np.array([0, 1, 2, 4])) |
| 760 | + 16.0 |
| 761 | + """ |
| 762 | + a, b = _convert_to_vector(a), _convert_to_vector(b) |
| 763 | + return a.dot(b) |
| 764 | + |
| 765 | + @staticmethod |
| 766 | + def squared_distance(a, b): |
| 767 | + """ |
| 768 | + Squared distance between two vectors. |
| 769 | + a and b can be of type, SparseVector, DenseVector, np.ndarray |
| 770 | + or array.array. |
| 771 | +
|
| 772 | + >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) |
| 773 | + >>> b = Vectors.dense([2, 5, 4, 1]) |
| 774 | + >>> a.squared_distance(b) |
| 775 | + 51.0 |
| 776 | + """ |
| 777 | + a, b = _convert_to_vector(a), _convert_to_vector(b) |
| 778 | + return a.squared_distance(b) |
| 779 | + |
| 780 | + @staticmethod |
| 781 | + def norm(vec, p): |
| 782 | + """ |
| 783 | + Find norm of the given vector. |
| 784 | + """ |
| 785 | + return _convert_to_vector(vec).norm(p) |
| 786 | + |
| 787 | + @staticmethod |
| 788 | + def parse(vectorString): |
| 789 | + if vectorString[0] == '[': |
| 790 | + return DenseVector.parse(vectorString) |
| 791 | + return SparseVector.parse(vectorString) |
| 792 | + |
| 793 | + @staticmethod |
| 794 | + def zeros(num): |
| 795 | + return DenseVector(np.zeros(num)) |
| 796 | + |
636 | 797 |
|
637 | 798 | class Matrix(object): |
638 | 799 | """ |
|
0 commit comments