-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8238][SPARK-8239][SPARK-8242][SPARK-8243][SPARK-8268][SQL] Add ascii/base64/unbase64/encode/decode functions #6843
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
491ce7b
e2df768
96170fc
ed5c19c
9d6f9f4
78dee7d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -298,3 +298,120 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI | |
|
|
||
| override def prettyName: String = "length" | ||
| } | ||
|
|
||
| /** | ||
| * Expression that returns, as an integer, the first byte of the child's string value. | ||
| * | ||
| * A null child value yields null and a string with no bytes yields 0. The byte is widened | ||
| * with its sign, so a first byte above 0x7F produces a negative result. | ||
| */ | ||
| case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTypes { | ||
| override def inputTypes: Seq[DataType] = Seq(StringType) | ||
| override def dataType: DataType = IntegerType | ||
|
|
||
| override def eval(input: InternalRow): Any = child.eval(input) match { | ||
| case null => null | ||
| case value => | ||
| // Only the first byte is inspected; an empty byte array maps to 0. | ||
| val raw = value.asInstanceOf[UTF8String].getBytes | ||
| if (raw.isEmpty) 0 else raw(0).toInt | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Expression that base64-encodes the child's binary value and returns the result as a string. | ||
| * A null child value yields null. | ||
| */ | ||
| case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTypes { | ||
| override def inputTypes: Seq[DataType] = Seq(BinaryType) | ||
| override def dataType: DataType = StringType | ||
|
|
||
| override def eval(input: InternalRow): Any = child.eval(input) match { | ||
| case null => null | ||
| case value => | ||
| // The codec class is fully qualified because this case class is also named Base64. | ||
| val encoded = | ||
| org.apache.commons.codec.binary.Base64.encodeBase64(value.asInstanceOf[Array[Byte]]) | ||
| UTF8String.fromBytes(encoded) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Expression that decodes the child's base64 string value into binary. | ||
| * A null child value yields null. | ||
| */ | ||
| case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInputTypes { | ||
| override def inputTypes: Seq[DataType] = Seq(StringType) | ||
| override def dataType: DataType = BinaryType | ||
|
|
||
| override def eval(input: InternalRow): Any = child.eval(input) match { | ||
| case null => null | ||
| case value => | ||
| val encoded = value.asInstanceOf[UTF8String].toString | ||
| org.apache.commons.codec.binary.Base64.decodeBase64(encoded) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Expression that decodes the binary value of `bin` into a string, using the character set | ||
| * named by `charset` | ||
| * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). | ||
| * If either argument is null, the result will also be null. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove "As of Hive 0.12.0" |
||
| */ | ||
| case class Decode(bin: Expression, charset: Expression) extends Expression with ExpectsInputTypes { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you make this extend BinaryExpression? You can just define def bin = left, and def charset = right.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually that's my intention, as I think the parameters is asymmetric semantically. Not sure if you are thinking the code impovement like #7157? |
||
| override def children: Seq[Expression] = Seq(bin, charset) | ||
| override def foldable: Boolean = children.forall(_.foldable) | ||
| override def nullable: Boolean = children.exists(_.nullable) | ||
| override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType) | ||
| override def dataType: DataType = StringType | ||
|
|
||
| override def eval(input: InternalRow): Any = { | ||
| val encoded = bin.eval(input) | ||
| // The charset expression is evaluated only when the binary value is non-null. | ||
| val charsetValue = if (encoded == null) null else charset.eval(input) | ||
| if (charsetValue == null) { | ||
| null | ||
| } else { | ||
| val fromCharset = charsetValue.asInstanceOf[UTF8String].toString | ||
| val bytes = encoded.asInstanceOf[Array[Byte]] | ||
| UTF8String.fromString(new String(bytes, fromCharset)) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Expression that encodes the string value of `value` into binary, using the character set | ||
| * named by `charset` | ||
| * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). | ||
| * If either argument is null, the result will also be null. | ||
| */ | ||
| case class Encode(value: Expression, charset: Expression) | ||
| extends Expression with ExpectsInputTypes { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here extend BinaryExpression too |
||
| override def children: Seq[Expression] = Seq(value, charset) | ||
| override def foldable: Boolean = children.forall(_.foldable) | ||
| override def nullable: Boolean = children.exists(_.nullable) | ||
| override def inputTypes: Seq[DataType] = Seq(StringType, StringType) | ||
| override def dataType: DataType = BinaryType | ||
|
|
||
| override def eval(input: InternalRow): Any = { | ||
| val text = value.eval(input) | ||
| // The charset expression is evaluated only when the string value is non-null. | ||
| val charsetValue = if (text == null) null else charset.eval(input) | ||
| if (charsetValue == null) { | ||
| null | ||
| } else { | ||
| val toCharset = charsetValue.asInstanceOf[UTF8String].toString | ||
| text.asInstanceOf[UTF8String].toString.getBytes(toCharset) | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1543,18 +1543,111 @@ object functions { | |
|
|
||
| /** | ||
| * Computes the length of the given string value. | ||
| * The result is a column built from a `StringLength` expression over `e`. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def strlen(e: Column): Column = StringLength(e.expr) | ||
|
|
||
| /** | ||
| * Computes the length of the string column with the given name. | ||
| * Equivalent to calling `strlen(Column(columnName))`. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def strlen(columnName: String): Column = strlen(Column(columnName)) | ||
|
|
||
| /** | ||
| * Computes the numeric value of the first character of the specified string value. | ||
| * The value is taken from the first byte of the string: an empty string yields 0 and | ||
| * a null input yields null. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def ascii(e: Column): Column = Ascii(e.expr) | ||
|
|
||
| /** | ||
| * Computes the numeric value of the first character of the string column with the given name. | ||
| * Equivalent to calling `ascii(Column(columnName))`. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def ascii(columnName: String): Column = ascii(Column(columnName)) | ||
|
|
||
| /** | ||
| * Encodes the specified binary value as a base64 string. | ||
| * A null input yields null. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def base64(e: Column): Column = Base64(e.expr) | ||
|
|
||
| /** | ||
| * Encodes the binary column with the given name as a base64 string. | ||
| * Equivalent to calling `base64(Column(columnName))`. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def base64(columnName: String): Column = base64(Column(columnName)) | ||
|
|
||
| /** | ||
| * Decodes the specified base64 string value into binary. | ||
| * A null input yields null. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def unbase64(e: Column): Column = UnBase64(e.expr) | ||
|
|
||
| /** | ||
| * Decodes the base64 string column with the given name into binary. | ||
| * Equivalent to calling `unbase64(Column(columnName))`. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def unbase64(columnName: String): Column = unbase64(Column(columnName)) | ||
|
|
||
| /** | ||
| * Encodes the first argument, a string, into binary using the character set given by the | ||
| * second argument | ||
| * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). | ||
| * If either argument is null, the result will also be null. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def encode(value: Column, charset: Column): Column = Encode(value.expr, charset.expr) | ||
|
|
||
| /** | ||
| * Encodes the string column named `columnName` into binary, using the character set read | ||
| * from the column named `charsetColumnName` | ||
| * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). | ||
| * Both arguments are column names; `charsetColumnName` is not a character set literal. | ||
| * If either argument is null, the result will also be null. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def encode(columnName: String, charsetColumnName: String): Column = | ||
| encode(Column(columnName), Column(charsetColumnName)) | ||
|
|
||
| /** | ||
| * Decodes the first argument, a binary value, into a string using the character set given | ||
| * by the second argument | ||
| * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). | ||
| * If either argument is null, the result will also be null. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def decode(value: Column, charset: Column): Column = Decode(value.expr, charset.expr) | ||
|
|
||
| /** | ||
| * Decodes the binary column named `columnName` into a string, using the character set read | ||
| * from the column named `charsetColumnName` | ||
| * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). | ||
| * Both arguments are column names; `charsetColumnName` is not a character set literal. | ||
| * If either argument is null, the result will also be null. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def decode(columnName: String, charsetColumnName: String): Column = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i am not sure if this makes sense -- since it is more likely users want to decode by typing in the charset, rather than using a column for that...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, in most of existed DF api, we take the string as the column name, should we break this pattern? Actually, it seems redundant for most of DF functions, which take the string columns as parameters, as well as the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's just change this one to take charset: String, rather than a column.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. basically two decode: def decode(column: Column, charset: String): Column
def decode(columnName: String, charset: String): Columnsame for encode
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, I will update it soon. |
||
| decode(Column(columnName), Column(charsetColumnName)) | ||
|
|
||
|
|
||
| ////////////////////////////////////////////////////////////////////////////////////////////// | ||
| ////////////////////////////////////////////////////////////////////////////////////////////// | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What should the behavior be if it is a non-ASCII UTF-8 string?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I copied the logic from Hive; Hive doesn't check whether it's a UTF-8 string.