Skip to content

Commit e86bf38

Browse files
committed
remove NumericTokenizer
1 parent 050fca4 commit e86bf38

File tree

2 files changed

+61
-115
lines changed

2 files changed

+61
-115
lines changed

mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala

Lines changed: 54 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -23,120 +23,89 @@ import scala.collection.mutable.{ArrayBuffer, ListBuffer}
2323

2424
import org.apache.spark.SparkException
2525

26-
private[mllib] object NumericTokenizer {
27-
val NUMBER = -1
28-
val END = -2
29-
}
30-
31-
import NumericTokenizer._
32-
3326
/**
34-
* Simple tokenizer for a numeric structure consisting of three types:
27+
* Simple parser for a numeric structure consisting of three types:
3528
*
3629
* - number: a double in Java's floating number format
3730
* - array: an array of numbers stored as `[v0,v1,...,vn]`
3831
* - tuple: a list of numbers, arrays, or tuples stored as `(...)`
39-
*
40-
* @param s input string
4132
*/
42-
private[mllib] class NumericTokenizer(s: String) {
43-
44-
private var allowComma = false
45-
private var _value = Double.NaN
46-
private val stringTokenizer = new StringTokenizer(s, "()[],", true)
33+
private[mllib] object NumericParser {
4734

48-
/**
49-
* Returns the most recent parsed number.
50-
*/
51-
def value: Double = _value
35+
/** Parses a string into a Double, an Array[Double], or a Seq[Any]. */
36+
def parse(s: String): Any = {
37+
val tokenizer = new StringTokenizer(s, "()[],", true)
38+
if (tokenizer.hasMoreTokens()) {
39+
val token = tokenizer.nextToken()
40+
if (token == "(") {
41+
parseTuple(tokenizer)
42+
} else if (token == "[") {
43+
parseArray(tokenizer)
44+
} else {
45+
// expecting a number
46+
java.lang.Double.parseDouble(token)
47+
}
48+
} else {
49+
throw new SparkException(s"Cannot find any token from the input string.")
50+
}
51+
}
5252

53-
/**
54-
* Returns the next token, which could be any of the following:
55-
* - '[', ']', '(', or ')'.
56-
* - [[org.apache.spark.mllib.util.NumericTokenizer#NUMBER]], call value() to get its value.
57-
* - [[org.apache.spark.mllib.util.NumericTokenizer#END]].
58-
*/
59-
def next(): Int = {
60-
if (stringTokenizer.hasMoreTokens()) {
61-
val token = stringTokenizer.nextToken()
62-
if (token == "(" || token == "[") {
63-
allowComma = false
64-
token.charAt(0)
65-
} else if (token == ")" || token == "]") {
66-
allowComma = true
67-
token.charAt(0)
53+
private def parseArray(tokenizer: StringTokenizer): Array[Double] = {
54+
val values = ArrayBuffer.empty[Double]
55+
var parsing = true
56+
var allowComma = false
57+
var token: String = null
58+
while (parsing && tokenizer.hasMoreTokens()) {
59+
token = tokenizer.nextToken()
60+
if (token == "]") {
61+
parsing = false
6862
} else if (token == ",") {
6963
if (allowComma) {
7064
allowComma = false
71-
next()
7265
} else {
7366
throw new SparkException("Found a ',' at a wrong position.")
7467
}
7568
} else {
7669
// expecting a number
77-
_value = java.lang.Double.parseDouble(token)
70+
values.append(java.lang.Double.parseDouble(token))
7871
allowComma = true
79-
NUMBER
8072
}
81-
} else {
82-
END
8373
}
84-
}
85-
}
86-
87-
/**
88-
* Simple parser for tokens from [[org.apache.spark.mllib.util.NumericTokenizer]].
89-
*/
90-
private[mllib] object NumericParser {
91-
92-
/** Parses a string into a Double, an Array[Double], or a Seq[Any]. */
93-
def parse(s: String): Any = parse(new NumericTokenizer(s))
94-
95-
private def parse(tokenizer: NumericTokenizer): Any = {
96-
val token = tokenizer.next()
97-
if (token == NUMBER) {
98-
tokenizer.value
99-
} else if (token == '(') {
100-
parseTuple(tokenizer)
101-
} else if (token == '[') {
102-
parseArray(tokenizer)
103-
} else if (token == END) {
104-
null
105-
} else {
106-
throw new SparkException(s"Cannot recognize token type: $token.")
107-
}
108-
}
109-
110-
private def parseArray(tokenizer: NumericTokenizer): Array[Double] = {
111-
val values = ArrayBuffer.empty[Double]
112-
var token = tokenizer.next()
113-
while (token == NUMBER) {
114-
values.append(tokenizer.value)
115-
token = tokenizer.next()
116-
}
117-
if (token != ']') {
118-
throw new SparkException(s"An array must end with ] but got $token.")
74+
if (parsing) {
75+
throw new SparkException(s"An array must end with ']'.")
11976
}
12077
values.toArray
12178
}
12279

123-
private def parseTuple(tokenizer: NumericTokenizer): Seq[_] = {
80+
private def parseTuple(tokenizer: StringTokenizer): Seq[_] = {
12481
val items = ListBuffer.empty[Any]
125-
var token = tokenizer.next()
126-
while (token != ')' && token != END) {
127-
if (token == NUMBER) {
128-
items.append(tokenizer.value)
129-
} else if (token == '(') {
82+
var parsing = true
83+
var allowComma = false
84+
var token: String = null
85+
while (parsing && tokenizer.hasMoreTokens()) {
86+
token = tokenizer.nextToken()
87+
if (token == "(") {
13088
items.append(parseTuple(tokenizer))
131-
} else if (token == '[') {
89+
allowComma = true
90+
} else if (token == "[") {
13291
items.append(parseArray(tokenizer))
92+
allowComma = true
93+
} else if (token == ",") {
94+
if (allowComma) {
95+
allowComma = false
96+
} else {
97+
throw new SparkException("Found a ',' at a wrong position.")
98+
}
99+
} else if (token == ")") {
100+
parsing = false
133101
} else {
134-
throw new SparkException(s"Cannot recognize token type: $token.")
102+
// expecting a number
103+
items.append(java.lang.Double.parseDouble(token))
104+
allowComma = true
135105
}
136-
token = tokenizer.next()
137106
}
138-
if (token != ')') {
139-
throw new SparkException(s"A tuple must end with ) but got $token.")
107+
if (parsing) {
108+
throw new SparkException(s"A tuple must end with ')'.")
140109
}
141110
items.toSeq
142111
}

mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,49 +17,26 @@
1717

1818
package org.apache.spark.mllib.util
1919

20-
import scala.collection.mutable.ListBuffer
21-
2220
import org.scalatest.FunSuite
2321

2422
import org.apache.spark.SparkException
2523

2624
class NumericParserSuite extends FunSuite {
2725

28-
test("tokenizer") {
26+
test("parser") {
2927
val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)"
30-
val tokenizer = new NumericTokenizer(s)
31-
var token = tokenizer.next()
32-
val tokens = ListBuffer.empty[Any]
33-
while (token != NumericTokenizer.END) {
34-
token match {
35-
case NumericTokenizer.NUMBER =>
36-
tokens.append(tokenizer.value)
37-
case other =>
38-
tokens.append(token)
39-
}
40-
token = tokenizer.next()
41-
}
42-
val expected = Seq('(', '(', 1.0, 2e3, ')', -4.0, '[', 5e-6, 7e8, ']', 9.0, ')')
43-
assert(expected === tokens)
28+
val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
29+
assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3))
30+
assert(parsed(1).asInstanceOf[Double] === -4.0)
31+
assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8))
32+
assert(parsed(3).asInstanceOf[Double] === 9.0)
4433

4534
val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4")
4635
malformatted.foreach { s =>
4736
intercept[SparkException] {
48-
val tokenizer = new NumericTokenizer(s)
49-
while (tokenizer.next() != NumericTokenizer.END) {
50-
// do nothing
51-
}
37+
NumericParser.parse(s)
5238
println(s"Didn't detect malformatted string $s.")
5339
}
5440
}
5541
}
56-
57-
test("parser") {
58-
val s = "((1,2),4,[5,6],8)"
59-
val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
60-
assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0))
61-
assert(parsed(1).asInstanceOf[Double] === 4.0)
62-
assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0, 6.0))
63-
assert(parsed(3).asInstanceOf[Double] === 8.0)
64-
}
6542
}

0 commit comments

Comments
 (0)