-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat:implement sql style 'substr_index' string function #8272
Changes from 10 commits
0b8a3cc
03b8d15
eb206bd
d5be382
5fccd49
407cf4b
b296f19
f3abde6
b5286fa
c0ceb37
89f9e4f
ec5f723
5a34984
33a2db2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -455,3 +455,68 @@ pub fn translate<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { | |
|
||
Ok(Arc::new(result) as ArrayRef) | ||
} | ||
|
||
/// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. | ||
/// SUBSTRING_INDEX('www.apache.org', '.', 1) = www | ||
/// SUBSTRING_INDEX('www.apache.org', '.', 2) = www.apache | ||
/// SUBSTRING_INDEX('www.apache.org', '.', -2) = apache.org | ||
/// SUBSTRING_INDEX('www.apache.org', '.', -1) = org | ||
pub fn substr_index<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
if args.len() != 3 { | ||
return Err(DataFusionError::Internal(format!( | ||
"substr_index function requires three arguments, got {}", | ||
args.len() | ||
))); | ||
} | ||
|
||
let string_array = as_generic_string_array::<T>(&args[0])?; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we need to add a defense check args is exactly 3 elements There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks, I add the args len check |
||
let delimiter_array = as_generic_string_array::<T>(&args[1])?; | ||
let count_array = as_int64_array(&args[2])?; | ||
|
||
let result = string_array | ||
.iter() | ||
.zip(delimiter_array.iter()) | ||
.zip(count_array.iter()) | ||
.map(|((string, delimiter), n)| match (string, delimiter, n) { | ||
(Some(string), Some(delimiter), Some(n)) => { | ||
let mut res = String::new(); | ||
match n { | ||
0 => { | ||
"".to_string(); | ||
} | ||
_other => { | ||
if n > 0 { | ||
let idx = string | ||
.split(delimiter) | ||
.take(n as usize) | ||
.fold(0, |len, x| len + x.len() + delimiter.len()) | ||
- delimiter.len(); | ||
res.push_str(if idx >= string.len() { | ||
string | ||
} else { | ||
&string[..idx] | ||
}); | ||
} else { | ||
let idx = (string.split(delimiter).take((-n) as usize).fold( | ||
string.len() as isize, | ||
|len, x| { | ||
len - x.len() as isize - delimiter.len() as isize | ||
}, | ||
) + delimiter.len() as isize) | ||
as usize; | ||
res.push_str(if idx >= string.len() { | ||
string | ||
} else { | ||
&string[idx..] | ||
}); | ||
} | ||
} | ||
} | ||
Some(res) | ||
} | ||
_ => None, | ||
}) | ||
.collect::<GenericStringArray<T>>(); | ||
|
||
Ok(Arc::new(result) as ArrayRef) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -877,3 +877,78 @@ query ? | |
SELECT levenshtein(NULL, NULL) | ||
---- | ||
NULL | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', '.', 1) | ||
---- | ||
www | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', '.', 2) | ||
---- | ||
www.apache | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', '.', -1) | ||
---- | ||
org | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', '.', -2) | ||
---- | ||
apache.org | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', 'ac', 1) | ||
---- | ||
www.ap | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', 'ac', -1) | ||
---- | ||
he.org | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', 'ac', 2) | ||
---- | ||
www.apache.org | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', 'ac', -2) | ||
---- | ||
www.apache.org | ||
|
||
query ? | ||
SELECT substr_index(NULL, 'ac', 1) | ||
---- | ||
NULL | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', NULL, 1) | ||
---- | ||
NULL | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', 'ac', NULL) | ||
---- | ||
NULL | ||
|
||
query T | ||
SELECT substr_index('', 'ac', 1) | ||
---- | ||
(empty) | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', '', 1) | ||
---- | ||
(empty) | ||
|
||
query T | ||
SELECT substr_index('www.apache.org', 'ac', 0) | ||
---- | ||
(empty) | ||
|
||
query ? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. awesome, can we also have the same tests with empty strings as input and search token? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add empty string tests and 0 count tests |
||
SELECT substr_index(NULL, NULL, NULL) | ||
---- | ||
NULL |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please use the
internal_err!
macro.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks, change to internal_err.