From 10be94aaf65f82e029f6f279624c3226e3708e41 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 5 Apr 2017 03:22:43 +0000 Subject: [PATCH 1/2] Make sure the converted csc matrix has sorted indices. --- python/pyspark/ml/linalg/__init__.py | 4 +++- python/pyspark/mllib/linalg/__init__.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index b765343251965..8c0a37a9a1513 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -72,7 +72,9 @@ def _convert_to_vector(l): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" - csc = l.tocsc() + # Make sure the converted csc_matrix has sorted indices. + csc = l.tocsr().tocsc() + assert csc.has_sorted_indices, "Converted CSC matrix should have sorted indices" return SparseVector(l.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(l)) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 031f22c02098e..2329798958cb2 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -74,7 +74,9 @@ def _convert_to_vector(l): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" - csc = l.tocsc() + # Make sure the converted csc_matrix has sorted indices. + csc = l.tocsr().tocsc() + assert csc.has_sorted_indices, "Converted CSC matrix should have sorted indices" return SparseVector(l.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(l)) From 2612d66c9eb5ebdf1ca5dfb61a687e37afbcb54a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 5 Apr 2017 14:58:59 +0000 Subject: [PATCH 2/2] Add test. Call sort_indices(). --- python/pyspark/ml/linalg/__init__.py | 5 +++-- python/pyspark/mllib/linalg/__init__.py | 5 +++-- python/pyspark/mllib/tests.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index 8c0a37a9a1513..ad1b487676fa7 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -73,8 +73,9 @@ def _convert_to_vector(l): elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" # Make sure the converted csc_matrix has sorted indices. - csc = l.tocsr().tocsc() - assert csc.has_sorted_indices, "Converted CSC matrix should have sorted indices" + csc = l.tocsc() + if not csc.has_sorted_indices: + csc.sort_indices() return SparseVector(l.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(l)) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 2329798958cb2..7b24b3c74a9fa 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -75,8 +75,9 @@ def _convert_to_vector(l): elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" # Make sure the converted csc_matrix has sorted indices. - csc = l.tocsr().tocsc() - assert csc.has_sorted_indices, "Converted CSC matrix should have sorted indices" + csc = l.tocsc() + if not csc.has_sorted_indices: + csc.sort_indices() return SparseVector(l.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(l)) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index c519883cdd73b..523b3f1113317 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -853,6 +853,17 @@ def serialize(l): self.assertEqual(sv, serialize(lil.tocsr())) self.assertEqual(sv, serialize(lil.todok())) + def test_convert_to_vector(self): + from scipy.sparse import csc_matrix + # Create a CSC matrix with non-sorted indices + indptr = array([0, 2]) + indices = array([3, 1]) + data = array([2.0, 1.0]) + csc = csc_matrix((data, indices, indptr)) + self.assertFalse(csc.has_sorted_indices) + sv = SparseVector(4, {1: 1, 3: 2}) + self.assertEqual(sv, _convert_to_vector(csc)) + def test_dot(self): from scipy.sparse import lil_matrix lil = lil_matrix((4, 1))