From a35122fd831d17b9c12d1570b9e40a86ae3be394 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 May 2025 08:05:37 -0600 Subject: [PATCH 1/2] Add Spark-compatible char expression --- datafusion/spark/src/function/string/chr.rs | 130 ++++++++++++++++++++ datafusion/spark/src/function/string/mod.rs | 7 ++ 2 files changed, 137 insertions(+) create mode 100644 datafusion/spark/src/function/string/chr.rs diff --git a/datafusion/spark/src/function/string/chr.rs b/datafusion/spark/src/function/string/chr.rs new file mode 100644 index 000000000000..dd6cdc83b30d --- /dev/null +++ b/datafusion/spark/src/function/string/chr.rs @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, sync::Arc}; + +use arrow::{ + array::{ArrayRef, StringArray}, + datatypes::{ + DataType, + DataType::{Int64, Utf8}, + }, +}; + +use datafusion_common::{cast::as_int64_array, exec_err, Result, ScalarValue}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; + +/// Spark-compatible `char` expression +/// +#[derive(Debug)] +pub struct SparkChar { + signature: Signature, +} + +impl Default for SparkChar { + fn default() -> Self { + Self::new() + } +} + +impl SparkChar { + pub fn new() -> Self { + Self { + signature: Signature::uniform(1, vec![Int64], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkChar { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "char" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Utf8) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + spark_chr(&args.args) + } +} + +/// Returns the ASCII character having the binary equivalent to the input expression. +/// E.g., chr(65) = 'A'. +/// Compatible with Apache Spark's Chr function +fn spark_chr(args: &[ColumnarValue]) -> Result { + let array = args[0].clone(); + match array { + ColumnarValue::Array(array) => { + let array = chr(&[array])?; + Ok(ColumnarValue::Array(array)) + } + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => { + if value < 0 { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "".to_string(), + )))) + } else { + match core::char::from_u32((value % 256) as u32) { + Some(ch) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + ch.to_string(), + )))), + None => { + exec_err!("requested character was incompatible for encoding.") + } + } + } + } + _ => exec_err!("The argument must be an Int64 array or scalar."), + } +} + +fn chr(args: &[ArrayRef]) -> Result { + let integer_array = as_int64_array(&args[0])?; + + // first map is the iterator, second is for the `Option<_>` + let result = integer_array + .iter() + .map(|integer: Option| { + integer + .map(|integer| { + if integer < 0 { + return Ok("".to_string()); // Return empty string for negative integers + } + match core::char::from_u32((integer % 256) as u32) { + Some(ch) => Ok(ch.to_string()), + None => { + exec_err!("requested character not compatible for encoding.") + } + } + }) + .transpose() + }) + .collect::>()?; + + Ok(Arc::new(result) as ArrayRef) +} diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index c01b6c45c008..c1197831144c 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -16,12 +16,14 @@ // under the License. pub mod ascii; +pub mod chr; use datafusion_expr::ScalarUDF; use datafusion_functions::make_udf_function; use std::sync::Arc; make_udf_function!(ascii::SparkAscii, ascii); +make_udf_function!(chr::SparkChar, chr); pub mod expr_fn { use datafusion_functions::export_functions; @@ -31,6 +33,11 @@ pub mod expr_fn { "Returns the ASCII code point of the first character of string.", arg1 )); + export_functions!(( + chr, + "Returns the ASCII character having the binary equivalent to col. If col is larger than 256 the result is equivalent to char(col % 256).", + arg1 + )); } pub fn functions() -> Vec> { From ff5a4cd081f9a5b467f66c75c9036c19435f32e7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 May 2025 08:17:54 -0600 Subject: [PATCH 2/2] Add slt test --- .../spark/src/function/string/{chr.rs => char.rs} | 0 datafusion/spark/src/function/string/mod.rs | 8 ++++---- .../sqllogictest/test_files/spark/string/char.slt | Bin 0 -> 970 bytes 3 files changed, 4 insertions(+), 4 deletions(-) rename datafusion/spark/src/function/string/{chr.rs => char.rs} (100%) create mode 100644 datafusion/sqllogictest/test_files/spark/string/char.slt diff --git a/datafusion/spark/src/function/string/chr.rs b/datafusion/spark/src/function/string/char.rs similarity index 100% rename from datafusion/spark/src/function/string/chr.rs rename to datafusion/spark/src/function/string/char.rs diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index c1197831144c..9d5fabe832e9 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -16,14 +16,14 @@ // under the License. pub mod ascii; -pub mod chr; +pub mod char; use datafusion_expr::ScalarUDF; use datafusion_functions::make_udf_function; use std::sync::Arc; make_udf_function!(ascii::SparkAscii, ascii); -make_udf_function!(chr::SparkChar, chr); +make_udf_function!(char::SparkChar, char); pub mod expr_fn { use datafusion_functions::export_functions; @@ -34,12 +34,12 @@ pub mod expr_fn { arg1 )); export_functions!(( - chr, + char, "Returns the ASCII character having the binary equivalent to col. If col is larger than 256 the result is equivalent to char(col % 256).", arg1 )); } pub fn functions() -> Vec> { - vec![ascii()] + vec![ascii(), char()] } diff --git a/datafusion/sqllogictest/test_files/spark/string/char.slt b/datafusion/sqllogictest/test_files/spark/string/char.slt new file mode 100644 index 0000000000000000000000000000000000000000..d8fc11f6d51270db219e81fbc844de6cb5512528 GIT binary patch literal 970 zcma)4O^=%}5bc>?G0I+ul#lIJ?IC-j*+fN6i4@4L_UOQbSPeF7o3i=$eI`&vY7gZC zHuL6v%-qpN=Bz9iwa$YDHm`uE&HU8`6q{JJN0n#*ZDi5*Je1QDeO=X!gB{-*c`Vv_e)@pQTW_2lvi+h#CT4A;9<0 z2PaT2Iv=EuT}C|@2^RoObaJj<%;#;}&O|qynKtFTkJ-*Q2sBAoQ^@u8dv#`O5z$*C z4RSqPNYoIMivuK{MeB#w&88bU*js~?RXG9J4pYCK%yr;CsX-YG-xeS%p@1GmDaGlC zzD8-BPUt1hzHRpzy+pfRlw|QLrR|QE+vF+E;%$P{nxf9*A hgu?r41AGmX`25p%jGZPIia|%nRqZb0-~$vz!EegCH;n)Q literal 0 HcmV?d00001