Skip to content

Commit ec35c80

Browse files
committed
Adds fixes from review:
* orders imports in stringOperations.scala
* Substring.dataType throws exception if children are unresolved
* inlines Substring.slice (~11.5% performance improvement on microbenchmark runs)
* adds a special `toString` case for two-argument SUBSTR expressions
* removes spurious I_ prefix to SUBSTR(ING) in HiveQl.scala

Thanks to @concretevitamin for prompt and useful feedback!
1 parent 4f3bfdb commit ec35c80

File tree

2 files changed

+12
-6
lines changed

2 files changed

+12
-6
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,9 @@ import java.util.regex.Pattern
2121

2222
import scala.collection.IndexedSeqOptimized
2323

24-
import org.apache.spark.sql.catalyst.types.DataType
25-
import org.apache.spark.sql.catalyst.types.StringType
26-
import org.apache.spark.sql.catalyst.types.BinaryType
27-
import org.apache.spark.sql.catalyst.types.BooleanType
24+
25+
import org.apache.spark.sql.catalyst.analysis.UnresolvedException
26+
import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType}
2827

2928
trait StringRegexExpression {
3029
self: BinaryExpression =>
@@ -219,13 +218,17 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends
219218

220219
def nullable: Boolean = true
221220
def dataType: DataType = {
221+
if (!resolved) {
222+
throw new UnresolvedException(this, s"Cannot resolve since $children are not resolved")
223+
}
222224
if (str.dataType == BinaryType) str.dataType else StringType
223225
}
224226

225227
def references = children.flatMap(_.references).toSet
226228

227229
override def children = str :: pos :: len :: Nil
228230

231+
@inline
229232
def slice[T, C <% IndexedSeqOptimized[T,_]](str: C, startPos: Int, sliceLen: Int): Any = {
230233
val len = str.length
231234
// Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
@@ -267,5 +270,8 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends
267270
}
268271
}
269272

270-
override def toString = s"SUBSTR($str, $pos, $len)"
273+
override def toString = len match {
274+
case max if max == Integer.MAX_VALUE => s"SUBSTR($str, $pos)"
275+
case _ => s"SUBSTR($str, $pos, $len)"
276+
}
271277
}

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -860,7 +860,7 @@ private[hive] object HiveQl {
860860
val BETWEEN = "(?i)BETWEEN".r
861861
val WHEN = "(?i)WHEN".r
862862
val CASE = "(?i)CASE".r
863-
val SUBSTR = "(?i)I_SUBSTR(?:ING)?".r
863+
val SUBSTR = "(?i)SUBSTR(?:ING)?".r
864864

865865
protected def nodeToExpr(node: Node): Expression = node match {
866866
/* Attribute References */

0 commit comments

Comments
 (0)