Skip to content

Commit e86bf38

Browse files
committed
remove NumericTokenizer
1 parent 050fca4 commit e86bf38

File tree

2 files changed

+61
-115
lines changed

2 files changed

+61
-115
lines changed

mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala

Lines changed: 54 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -23,120 +23,89 @@ import scala.collection.mutable.{ArrayBuffer, ListBuffer}
2323

2424
import org.apache.spark.SparkException
2525

26-
private[mllib] object NumericTokenizer {
27-
val NUMBER = -1
28-
val END = -2
29-
}
30-
31-
import NumericTokenizer._
32-
3326
/**
34-
* Simple tokenizer for a numeric structure consisting of three types:
27+
* Simple parser for a numeric structure consisting of three types:
3528
*
3629
* - number: a double in Java's floating number format
3730
* - array: an array of numbers stored as `[v0,v1,...,vn]`
3831
* - tuple: a list of numbers, arrays, or tuples stored as `(...)`
39-
*
40-
* @param s input string
4132
*/
42-
private[mllib] class NumericTokenizer(s: String) {
43-
44-
private var allowComma = false
45-
private var _value = Double.NaN
46-
private val stringTokenizer = new StringTokenizer(s, "()[],", true)
33+
private[mllib] object NumericParser {
4734

48-
/**
49-
* Returns the most recent parsed number.
50-
*/
51-
def value: Double = _value
35+
/** Parses a string into a Double, an Array[Double], or a Seq[Any]. */
36+
def parse(s: String): Any = {
37+
val tokenizer = new StringTokenizer(s, "()[],", true)
38+
if (tokenizer.hasMoreTokens()) {
39+
val token = tokenizer.nextToken()
40+
if (token == "(") {
41+
parseTuple(tokenizer)
42+
} else if (token == "[") {
43+
parseArray(tokenizer)
44+
} else {
45+
// expecting a number
46+
java.lang.Double.parseDouble(token)
47+
}
48+
} else {
49+
throw new SparkException(s"Cannot find any token from the input string.")
50+
}
51+
}
5252

53-
/**
54-
* Returns the next token, which could be any of the following:
55-
* - '[', ']', '(', or ')'.
56-
* - [[org.apache.spark.mllib.util.NumericTokenizer#NUMBER]], call value() to get its value.
57-
* - [[org.apache.spark.mllib.util.NumericTokenizer#END]].
58-
*/
59-
def next(): Int = {
60-
if (stringTokenizer.hasMoreTokens()) {
61-
val token = stringTokenizer.nextToken()
62-
if (token == "(" || token == "[") {
63-
allowComma = false
64-
token.charAt(0)
65-
} else if (token == ")" || token == "]") {
66-
allowComma = true
67-
token.charAt(0)
53+
private def parseArray(tokenizer: StringTokenizer): Array[Double] = {
54+
val values = ArrayBuffer.empty[Double]
55+
var parsing = true
56+
var allowComma = false
57+
var token: String = null
58+
while (parsing && tokenizer.hasMoreTokens()) {
59+
token = tokenizer.nextToken()
60+
if (token == "]") {
61+
parsing = false
6862
} else if (token == ",") {
6963
if (allowComma) {
7064
allowComma = false
71-
next()
7265
} else {
7366
throw new SparkException("Found a ',' at a wrong position.")
7467
}
7568
} else {
7669
// expecting a number
77-
_value = java.lang.Double.parseDouble(token)
70+
values.append(java.lang.Double.parseDouble(token))
7871
allowComma = true
79-
NUMBER
8072
}
81-
} else {
82-
END
8373
}
84-
}
85-
}
86-
87-
/**
88-
* Simple parser for tokens from [[org.apache.spark.mllib.util.NumericTokenizer]].
89-
*/
90-
private[mllib] object NumericParser {
91-
92-
/** Parses a string into a Double, an Array[Double], or a Seq[Any]. */
93-
def parse(s: String): Any = parse(new NumericTokenizer(s))
94-
95-
private def parse(tokenizer: NumericTokenizer): Any = {
96-
val token = tokenizer.next()
97-
if (token == NUMBER) {
98-
tokenizer.value
99-
} else if (token == '(') {
100-
parseTuple(tokenizer)
101-
} else if (token == '[') {
102-
parseArray(tokenizer)
103-
} else if (token == END) {
104-
null
105-
} else {
106-
throw new SparkException(s"Cannot recognize token type: $token.")
107-
}
108-
}
109-
110-
private def parseArray(tokenizer: NumericTokenizer): Array[Double] = {
111-
val values = ArrayBuffer.empty[Double]
112-
var token = tokenizer.next()
113-
while (token == NUMBER) {
114-
values.append(tokenizer.value)
115-
token = tokenizer.next()
116-
}
117-
if (token != ']') {
118-
throw new SparkException(s"An array must end with ] but got $token.")
74+
if (parsing) {
75+
throw new SparkException(s"An array must end with ']'.")
11976
}
12077
values.toArray
12178
}
12279

123-
private def parseTuple(tokenizer: NumericTokenizer): Seq[_] = {
80+
private def parseTuple(tokenizer: StringTokenizer): Seq[_] = {
12481
val items = ListBuffer.empty[Any]
125-
var token = tokenizer.next()
126-
while (token != ')' && token != END) {
127-
if (token == NUMBER) {
128-
items.append(tokenizer.value)
129-
} else if (token == '(') {
82+
var parsing = true
83+
var allowComma = false
84+
var token: String = null
85+
while (parsing && tokenizer.hasMoreTokens()) {
86+
token = tokenizer.nextToken()
87+
if (token == "(") {
13088
items.append(parseTuple(tokenizer))
131-
} else if (token == '[') {
89+
allowComma = true
90+
} else if (token == "[") {
13291
items.append(parseArray(tokenizer))
92+
allowComma = true
93+
} else if (token == ",") {
94+
if (allowComma) {
95+
allowComma = false
96+
} else {
97+
throw new SparkException("Found a ',' at a wrong position.")
98+
}
99+
} else if (token == ")") {
100+
parsing = false
133101
} else {
134-
throw new SparkException(s"Cannot recognize token type: $token.")
102+
// expecting a number
103+
items.append(java.lang.Double.parseDouble(token))
104+
allowComma = true
135105
}
136-
token = tokenizer.next()
137106
}
138-
if (token != ')') {
139-
throw new SparkException(s"A tuple must end with ) but got $token.")
107+
if (parsing) {
108+
throw new SparkException(s"A tuple must end with ')'.")
140109
}
141110
items.toSeq
142111
}

mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,49 +17,26 @@
1717

1818
package org.apache.spark.mllib.util
1919

20-
import scala.collection.mutable.ListBuffer
21-
2220
import org.scalatest.FunSuite
2321

2422
import org.apache.spark.SparkException
2523

2624
class NumericParserSuite extends FunSuite {
2725

28-
test("tokenizer") {
26+
test("parser") {
2927
val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)"
30-
val tokenizer = new NumericTokenizer(s)
31-
var token = tokenizer.next()
32-
val tokens = ListBuffer.empty[Any]
33-
while (token != NumericTokenizer.END) {
34-
token match {
35-
case NumericTokenizer.NUMBER =>
36-
tokens.append(tokenizer.value)
37-
case other =>
38-
tokens.append(token)
39-
}
40-
token = tokenizer.next()
41-
}
42-
val expected = Seq('(', '(', 1.0, 2e3, ')', -4.0, '[', 5e-6, 7e8, ']', 9.0, ')')
43-
assert(expected === tokens)
28+
val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
29+
assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3))
30+
assert(parsed(1).asInstanceOf[Double] === -4.0)
31+
assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8))
32+
assert(parsed(3).asInstanceOf[Double] === 9.0)
4433

4534
val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4")
4635
malformatted.foreach { s =>
4736
intercept[SparkException] {
48-
val tokenizer = new NumericTokenizer(s)
49-
while (tokenizer.next() != NumericTokenizer.END) {
50-
// do nothing
51-
}
37+
NumericParser.parse(s)
5238
println(s"Didn't detect malformatted string $s.")
5339
}
5440
}
5541
}
56-
57-
test("parser") {
58-
val s = "((1,2),4,[5,6],8)"
59-
val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
60-
assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0))
61-
assert(parsed(1).asInstanceOf[Double] === 4.0)
62-
assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0, 6.0))
63-
assert(parsed(3).asInstanceOf[Double] === 8.0)
64-
}
6542
}

0 commit comments

Comments
 (0)