Skip to content

Commit

Permalink
feat(datafusion): implement ops.RegexSplit using pyarrow UDF
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored and gforsyth committed Dec 19, 2023
1 parent 9d1295f commit 37b6b7f
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 2 deletions.
1 change: 1 addition & 0 deletions ibis/backends/datafusion/compiler/values.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def translate_val(op, **_):
ops.ArrayContains: "array_contains",
ops.ArrayLength: "array_length",
ops.ArrayRemove: "array_remove_all",
ops.RegexSplit: "regex_split",
}

for _op, _name in _simple_ops.items():
Expand Down
12 changes: 11 additions & 1 deletion ibis/backends/datafusion/udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import pyarrow.compute as pc
import pyarrow_hotfix # noqa: F401

import ibis.expr.datatypes as dt # noqa: TCH001
import ibis.common.exceptions as com
import ibis.expr.datatypes as dt


def _extract_epoch_seconds(array) -> dt.int32:
Expand Down Expand Up @@ -113,3 +114,12 @@ def extract_minute_timestamp(array: dt.Timestamp(scale=9)) -> dt.int32:

def extract_hour_time(array: dt.time) -> dt.int32:
    """Extract the hour field from `array` and cast it to 32-bit integers."""
    hours = pc.hour(array)
    return pc.cast(hours, pa.int32())


def regex_split(s: str, pattern: str) -> list[str]:
    """Split `s` on matches of the one regex carried by `pattern`.

    `pattern` is materialized with ``to_pylist()`` and must contain exactly
    one element; that element is handed to ``pc.split_pattern_regex`` as
    the pattern to split on.

    Raises
    ------
    com.IbisError
        If `pattern` does not hold exactly one value.
    """
    # NOTE(review): the annotations look like logical types rather than the
    # runtime argument types (`pattern` needs a `to_pylist` method) -- confirm
    # against the code that registers these UDFs.
    candidates = pattern.to_pylist()
    if len(candidates) == 1:
        (regex,) = candidates
        return pc.split_pattern_regex(s, regex)
    raise com.IbisError(
        "Only a single scalar pattern is supported for DataFusion re_split"
    )
1 change: 0 additions & 1 deletion ibis/backends/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,7 +1103,6 @@ def test_non_match_regex_search_is_false(con):
@pytest.mark.notimpl(
[
"dask",
"datafusion",
"impala",
"mysql",
"sqlite",
Expand Down

0 comments on commit 37b6b7f

Please sign in to comment.