apache · johnc1231 · Mar 23, 2017 · Mar 27, 2017 · Mar 28, 2017 · Apr 2, 2017
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
@@ -91,15 +91,15 @@ class IndexedRowMatrix @Since("1.0.0") (
   }
 
   /**
-   * Converts to BlockMatrix. Creates blocks of `SparseMatrix` with size 1024 x 1024.
+   * Converts to BlockMatrix. Creates blocks with size 1024 x 1024.
    */
   @Since("1.3.0")
   def toBlockMatrix(): BlockMatrix = {
     toBlockMatrix(1024, 1024)
   }
 
   /**
-   * Converts to BlockMatrix. Creates blocks of `SparseMatrix`.
+   * Converts to BlockMatrix. Blocks may be sparse or dense depending on the sparsity of the rows.
    * @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have
    *                     a smaller value. Must be an integer value greater than 0.
    * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have
@@ -108,8 +108,64 @@ class IndexedRowMatrix @Since("1.0.0") (
    */
   @Since("1.3.0")
   def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = {
-    // TODO: This implementation may be optimized
-    toCoordinateMatrix().toBlockMatrix(rowsPerBlock, colsPerBlock)
+    require(rowsPerBlock > 0,
+      s"rowsPerBlock needs to be greater than 0. rowsPerBlock: $rowsPerBlock")
+    require(colsPerBlock > 0,
+      s"colsPerBlock needs to be greater than 0. colsPerBlock: $colsPerBlock")
+
+    val m = numRows()
+    val n = numCols()
+    val lastRowBlockIndex = m / rowsPerBlock
+    val lastColBlockIndex = n / colsPerBlock
+    val lastRowBlockSize = (m % rowsPerBlock).toInt
+    val lastColBlockSize = (n % colsPerBlock).toInt
+    val numRowBlocks = math.ceil(m.toDouble / rowsPerBlock).toInt
+    val numColBlocks = math.ceil(n.toDouble / colsPerBlock).toInt
+
+    val blocks = rows.flatMap { ir: IndexedRow =>
+      val blockRow = ir.index / rowsPerBlock
+      val rowInBlock = ir.index % rowsPerBlock
+
+      ir.vector match {
+        case SparseVector(size, indices, values) =>
+          indices.zip(values).map { case (index, value) =>
+            val blockColumn = index / colsPerBlock
+            val columnInBlock = index % colsPerBlock
+            ((blockRow.toInt, blockColumn.toInt), (rowInBlock.toInt, Array((value, columnInBlock))))
+          }
+        case DenseVector(values) =>
+          values.grouped(colsPerBlock)
+            .zipWithIndex
+            .map { case (values, blockColumn) =>
+              ((blockRow.toInt, blockColumn), (rowInBlock.toInt, values.zipWithIndex))
+            }
+      }
+    }.groupByKey(GridPartitioner(numRowBlocks, numColBlocks, rows.getNumPartitions)).map {
+      case ((blockRow, blockColumn), itr) =>
+        val actualNumRows =
+          if (blockRow == lastRowBlockIndex) lastRowBlockSize else rowsPerBlock
+        val actualNumColumns =
+          if (blockColumn == lastColBlockIndex) lastColBlockSize else colsPerBlock
+
+        val arraySize = actualNumRows * actualNumColumns
+        val matrixAsArray = new Array[Double](arraySize)
+        var countForValues = 0
+        itr.foreach { case (rowWithinBlock, valuesWithColumns) =>
+          valuesWithColumns.foreach { case (value, columnWithinBlock) =>
+            matrixAsArray.update(columnWithinBlock * actualNumRows + rowWithinBlock, value)
+            countForValues += 1
+          }
+        }
+        val denseMatrix = new DenseMatrix(actualNumRows, actualNumColumns, matrixAsArray)
+        val finalMatrix = if (countForValues / arraySize.toDouble >= 0.5) {
+          denseMatrix
+        } else {
+          denseMatrix.toSparse
+        }
+
+        ((blockRow, blockColumn), finalMatrix)
+    }
+    new BlockMatrix(blocks, rowsPerBlock, colsPerBlock, this.numRows(), this.numCols())
   }
 
   /**

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.linalg.distributed
 import breeze.linalg.{diag => brzDiag, DenseMatrix => BDM, DenseVector => BDV}
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.mllib.linalg.{Matrices, Vectors}
+import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
 
@@ -87,21 +87,74 @@ class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(coordMat.toBreeze() === idxRowMat.toBreeze())
   }
 
-  test("toBlockMatrix") {
-    val idxRowMat = new IndexedRowMatrix(indexedRows)
-    val blockMat = idxRowMat.toBlockMatrix(2, 2)
+  test("toBlockMatrix dense backing") {
+    val idxRowMatDense = new IndexedRowMatrix(indexedRows)
+
+    // Tests when n % colsPerBlock != 0
+    val blockMat = idxRowMatDense.toBlockMatrix(2, 2)
     assert(blockMat.numRows() === m)
     assert(blockMat.numCols() === n)
-    assert(blockMat.toBreeze() === idxRowMat.toBreeze())
+    assert(blockMat.toBreeze() === idxRowMatDense.toBreeze())
+
+    // Tests when m % rowsPerBlock != 0
+    val blockMat2 = idxRowMatDense.toBlockMatrix(3, 1)
+    assert(blockMat2.numRows() === m)
+    assert(blockMat2.numCols() === n)
+    assert(blockMat2.toBreeze() === idxRowMatDense.toBreeze())
 
     intercept[IllegalArgumentException] {
-      idxRowMat.toBlockMatrix(-1, 2)
+      idxRowMatDense.toBlockMatrix(-1, 2)
     }
     intercept[IllegalArgumentException] {
-      idxRowMat.toBlockMatrix(2, 0)
+      idxRowMatDense.toBlockMatrix(2, 0)
     }
+    assert(blockMat.blocks.map { case (_, matrix: Matrix) =>
+      matrix.isInstanceOf[DenseMatrix]}.reduce(_ && _))
+  }
+
+  test("toBlockMatrix sparse backing") {
+    val sparseData = Seq(
+      (3L, Vectors.sparse(3, Seq((0, 4.0))))
+    ).map(x => IndexedRow(x._1, x._2))
+
+    val idxRowMatSparse = new IndexedRowMatrix(sc.parallelize(sparseData))
+
+    // Tests when n % colsPerBlock != 0
+    val blockMat = idxRowMatSparse.toBlockMatrix(2, 2)
+    assert(blockMat.numRows() === m)
+    assert(blockMat.numCols() === n)
+    assert(blockMat.toBreeze() === idxRowMatSparse.toBreeze())
+
+    // Tests when m % rowsPerBlock != 0
+    val blockMat2 = idxRowMatSparse.toBlockMatrix(3, 1)
+    assert(blockMat2.numRows() === m)
+    assert(blockMat2.numCols() === n)
+    assert(blockMat2.toBreeze() === idxRowMatSparse.toBreeze())
+
+    assert(blockMat.blocks.map { case (_, matrix: Matrix) =>
+      matrix.isInstanceOf[SparseMatrix]}.reduce(_ && _))
+  }
+
+  test("toBlockMatrix mixed backing") {
+    val mixedData = Seq(
+      (0L, Vectors.dense(1, 2, 3)),
+      (3L, Vectors.sparse(3, Seq((0, 4.0)))))
+
+    val idxRowMatMixed = new IndexedRowMatrix(
+      sc.parallelize(mixedData.map(x => IndexedRow(x._1, x._2))))
+
+    val blockMat = idxRowMatMixed.toBlockMatrix(2, 2)
+    assert(blockMat.numRows() === m)
+    assert(blockMat.numCols() === n)
+    assert(blockMat.toBreeze() === idxRowMatMixed.toBreeze())
+
+    val blocks = blockMat.blocks.collect()
+
+    blocks.forall { case((row, col), matrix) =>
+      if (row == 0) matrix.isInstanceOf[DenseMatrix] else matrix.isInstanceOf[SparseMatrix]}
   }
 
+
   test("multiply a local matrix") {
     val A = new IndexedRowMatrix(indexedRows)
     val B = Matrices.dense(3, 2, Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0))