Commit 810d6df

update tokenizer/parser implementation

1 parent 7aac03a

5 files changed: +73 −54 lines changed

mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala

Lines changed: 2 additions & 1 deletion

@@ -26,6 +26,7 @@ import scala.collection.JavaConverters._
 import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
 
 import org.apache.spark.mllib.util.NumericParser
+import org.apache.spark.SparkException
 
 /**
  * Represents a numeric vector, whose index type is Int and value type is Double.
@@ -141,7 +142,7 @@ object Vectors {
       case Seq(size: Double, indices: Array[Double], values: Array[Double]) =>
         Vectors.sparse(size.toInt, indices.map(_.toInt), values)
       case other =>
-        sys.error(s"Cannot parse $other.")
+        throw new SparkException(s"Cannot parse $other.")
     }
   }
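The caller-visible effect: a malformed vector string now fails with org.apache.spark.SparkException rather than the bare RuntimeException produced by sys.error, so callers can catch it specifically. A minimal sketch (input literals borrowed from the updated test suite below):

import org.apache.spark.SparkException
import org.apache.spark.mllib.linalg.Vectors

// Well-formed dense and sparse strings parse as before.
val dense = Vectors.parse("[1.0E6,0.0,-2.0e-7]")
val sparse = Vectors.parse("(3,[0,2],[1.0,-2.0])")

// A two-element tuple matches neither the sparse pattern nor a dense
// array, so it falls through to the new SparkException.
try {
  Vectors.parse("(1,[1,2])")
} catch {
  case e: SparkException => println(s"rejected: ${e.getMessage}")
}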

mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala

Lines changed: 2 additions & 1 deletion

@@ -19,6 +19,7 @@ package org.apache.spark.mllib.regression
 
 import org.apache.spark.mllib.linalg.{Vectors, Vector}
 import org.apache.spark.mllib.util.NumericParser
+import org.apache.spark.SparkException
 
 /**
  * Class that represents the features and labels of a data point.
@@ -43,7 +44,7 @@ object LabeledPoint {
       case Seq(label: Double, numeric: Any) =>
         LabeledPoint(label, Vectors.parseNumeric(numeric))
       case other =>
-        sys.error(s"Cannot parse $other.")
+        throw new SparkException(s"Cannot parse $other.")
     }
   } else { // dense format used before v1.0
     val parts = s.split(',')
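Same pattern for labeled points: a string that tokenizes but does not reduce to a (label, features) pair now surfaces as SparkException. The enclosing method signature is not visible in this hunk; in MLlib this logic lives in LabeledPoint.parse, which the sketch below assumes:

import org.apache.spark.SparkException
import org.apache.spark.mllib.regression.LabeledPoint

// v1.0 format: a (label, [features]) tuple routed through NumericParser.
val p = LabeledPoint.parse("(1.0,[1.0,0.0,3.0])")

// Three bare numbers form a tuple that matches neither case above,
// so the new SparkException propagates with a "Cannot parse" message.
try {
  LabeledPoint.parse("(1.0,2.0,3.0)")
} catch {
  case e: SparkException => println(s"rejected: ${e.getMessage}")
}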

mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala

Lines changed: 61 additions & 48 deletions

@@ -19,6 +19,8 @@ package org.apache.spark.mllib.util
 
 import scala.collection.mutable.{ArrayBuffer, ListBuffer}
 
+import org.apache.spark.SparkException
+
 private[mllib] object NumericTokenizer {
   val NUMBER = -1
   val END = -2
@@ -61,39 +63,43 @@ private[mllib] class NumericTokenizer(s: String, start: Int, end: Int) {
    */
   def next(): Int = {
     if (cur < end) {
-      val c = s(cur)
-      c match {
-        case '(' | '[' =>
-          allowComma = false
-          cur += 1
-          c
-        case ')' | ']' =>
-          allowComma = true
-          cur += 1
-          c
-        case ',' =>
-          if (allowComma) {
-            cur += 1
-            allowComma = false
-            next()
-          } else {
-            sys.error("Found a ',' at a wrong location.")
-          }
-        case other => // expecting a number
-          var inNumber = true
-          val sb = new StringBuilder()
-          while (cur < end && inNumber) {
-            val d = s(cur)
-            if (d == ')' || d == ']' || d == ',') {
-              inNumber = false
-            } else {
-              sb.append(d)
-              cur += 1
-            }
-          }
-          _value = sb.toString().toDouble
-          allowComma = true
-          NUMBER
+      val c = s.charAt(cur)
+      if (c == '(' || c == '[') {
+        allowComma = false
+        cur += 1
+        c
+      } else if (c == ')' || c == ']') {
+        allowComma = true
+        cur += 1
+        c
+      } else if (c == ',') {
+        if (allowComma) {
+          cur += 1
+          allowComma = false
+          next()
+        } else {
+          throw new SparkException(s"Found a ',' at a wrong location: $cur.")
+        }
+      } else {
+        // expecting a number
+        var inNumber = true
+        val beginAt = cur
+        while (cur < end && inNumber) {
+          val d = s.charAt(cur)
+          if (d == ')' || d == ']' || d == ',') {
+            inNumber = false
+          } else {
+            cur += 1
+          }
+        }
+        try {
+          _value = java.lang.Double.parseDouble(s.substring(beginAt, cur))
+        } catch {
+          case e: Throwable =>
+            throw new SparkException("Error parsing a number", e)
+        }
+        allowComma = true
+        NUMBER
       }
     } else {
       END
@@ -110,15 +116,17 @@ private[mllib] object NumericParser {
   def parse(s: String): Any = parse(new NumericTokenizer(s))
 
   private def parse(tokenizer: NumericTokenizer): Any = {
-    tokenizer.next() match {
-      case '(' =>
-        parseTuple(tokenizer)
-      case '[' =>
-        parseArray(tokenizer)
-      case NUMBER =>
-        tokenizer.value
-      case END =>
-        null
+    val token = tokenizer.next()
+    if (token == NUMBER) {
+      tokenizer.value
+    } else if (token == '(') {
+      parseTuple(tokenizer)
+    } else if (token == '[') {
+      parseArray(tokenizer)
+    } else if (token == END) {
+      null
+    } else {
+      throw new SparkException(s"Cannot recognize token type: $token.")
     }
   }
@@ -129,25 +137,30 @@ private[mllib] object NumericParser {
       values.append(tokenizer.value)
       token = tokenizer.next()
     }
-    require(token == ']')
+    if (token != ']') {
+      throw new SparkException(s"An array must end with ] but got $token.")
+    }
     values.toArray
   }
 
   private def parseTuple(tokenizer: NumericTokenizer): Seq[_] = {
     val items = ListBuffer.empty[Any]
     var token = tokenizer.next()
     while (token != ')' && token != END) {
-      token match {
-        case '(' =>
-          items.append(parseTuple(tokenizer))
-        case '[' =>
-          items.append(parseArray(tokenizer))
-        case NUMBER =>
-          items.append(tokenizer.value)
+      if (token == NUMBER) {
+        items.append(tokenizer.value)
+      } else if (token == '(') {
+        items.append(parseTuple(tokenizer))
+      } else if (token == '[') {
+        items.append(parseArray(tokenizer))
+      } else {
+        throw new SparkException(s"Cannot recognize token type: $token.")
      }
      token = tokenizer.next()
    }
-    require(token == ')')
+    if (token != ')') {
+      throw new SparkException(s"A tuple must end with ) but got $token.")
+    }
     items.toSeq
   }
 }
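For readers new to this parser, the return shapes are: a bare number parses to a Double, a [...] block to an Array[Double], and a (...) block to a possibly nested Seq[Any]. NumericParser is private[mllib], so any code exercising it must live under the org.apache.spark.mllib package; the demo object below is hypothetical:

package org.apache.spark.mllib.util

// Hypothetical demo object; NumericParser is private[mllib], so this
// sketch only compiles inside the mllib source tree.
object NumericParserDemo {
  def main(args: Array[String]): Unit = {
    // A bare number parses to a Double.
    val n = NumericParser.parse("1.5").asInstanceOf[Double]

    // "(1.0,[2.0,3.0])" parses to Seq(1.0, Array(2.0, 3.0)).
    val parsed = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[Any]]
    val label = parsed(0).asInstanceOf[Double]         // 1.0
    val values = parsed(1).asInstanceOf[Array[Double]] // Array(2.0, 3.0)
    println(s"$n $label ${values.mkString(",")}")
  }
}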

mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala

Lines changed: 5 additions & 3 deletions

@@ -19,6 +19,8 @@ package org.apache.spark.mllib.linalg
 
 import org.scalatest.FunSuite
 
+import org.apache.spark.SparkException
+
 class VectorsSuite extends FunSuite {
 
   val arr = Array(0.1, 0.0, 0.3, 0.4)
@@ -105,7 +107,7 @@ class VectorsSuite extends FunSuite {
     val vectors = Seq(
       Vectors.dense(Array.empty[Double]),
       Vectors.dense(1.0),
-      Vectors.dense(1.0, 0.0, -2.0),
+      Vectors.dense(1.0E6, 0.0, -2.0e-7),
       Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
       Vectors.sparse(1, Array(0), Array(1.0)),
       Vectors.sparse(3, Array(0, 2), Array(1.0, -2.0)))
@@ -115,9 +117,9 @@ class VectorsSuite extends FunSuite {
       assert(v === v1)
     }
 
-    val malformatted = Seq("1", "[1,,]", "[1,2", "(1,[1,2])", "(1,[1],[2.0,1.0])")
+    val malformatted = Seq("1", "[1,,]", "[1,2b]", "(1,[1,2])", "([1],[2.0,1.0])")
     malformatted.foreach { s =>
-      intercept[RuntimeException] {
+      intercept[SparkException] {
        Vectors.parse(s)
        println(s"Didn't detect malformatted string $s.")
      }
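The new fixture values 1.0E6 and -2.0e-7 exercise scientific notation, which java.lang.Double.parseDouble (now used by the tokenizer) accepts, and "[1,2b]" replaces the unterminated "[1,2", presumably so the failure exercises the new number-parsing path rather than the missing-bracket check. A one-liner illustrating the round-trip assumption:

// parseDouble handles both exponent spellings used in the fixtures.
val xs = Array("1.0E6", "-2.0e-7").map(s => java.lang.Double.parseDouble(s))
println(xs.mkString(",")) // 1000000.0,-2.0E-7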

mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala

Lines changed: 3 additions & 1 deletion

@@ -21,6 +21,8 @@ import scala.collection.mutable.ListBuffer
 
 import org.scalatest.FunSuite
 
+import org.apache.spark.SparkException
+
 class NumericParserSuite extends FunSuite {
 
   test("tokenizer") {
@@ -42,7 +44,7 @@ class NumericParserSuite extends FunSuite {
 
     val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4")
     malformatted.foreach { s =>
-      intercept[RuntimeException] {
+      intercept[SparkException] {
        val tokenizer = new NumericTokenizer(s)
        while (tokenizer.next() != NumericTokenizer.END) {
          // do nothing
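The token protocol this test loop relies on: next() returns NUMBER (-1) with the parsed Double available via value, the delimiter character itself for parentheses and brackets, and END (-2) at end of input. A sketch of driving the tokenizer directly (hypothetical demo object; NumericTokenizer is private[mllib]):

package org.apache.spark.mllib.util

import scala.collection.mutable.ListBuffer

// Hypothetical demo; collects every NUMBER token from a vector string.
object NumericTokenizerDemo {
  def main(args: Array[String]): Unit = {
    val tokenizer = new NumericTokenizer("(1.0,[2.0,3.0])")
    val numbers = ListBuffer.empty[Double]
    var token = tokenizer.next()
    while (token != NumericTokenizer.END) {
      if (token == NumericTokenizer.NUMBER) {
        numbers += tokenizer.value
      }
      token = tokenizer.next()
    }
    println(numbers.mkString(",")) // 1.0,2.0,3.0
  }
}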
