diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index 2bc6936d77b..4958ec970c8 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -237,6 +237,15 @@ std::vector GetStringFunctionRegistry() {
                      utf8(), kResultNullIfNull, "substr_utf8_int64",
                      NativeFunction::kNeedsContext),
 
+      NativeFunction("substr", {"substring"},
+                     DataTypeVector{utf8(), int32() /*offset*/, int32() /*length*/},
+                     utf8(), kResultNullIfNull, "substr_utf8_int32_int32",
+                     NativeFunction::kNeedsContext),
+
+      NativeFunction("substr", {"substring"}, DataTypeVector{utf8(), int32() /*offset*/},
+                     utf8(), kResultNullIfNull, "substr_utf8_int32",
+                     NativeFunction::kNeedsContext),
+
       NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
                      kResultNullIfNull, "lpad_utf8_int32_utf8",
                      NativeFunction::kNeedsContext),
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index c255b9a11c0..e6e14478771 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -821,12 +821,99 @@ const char* substr_utf8_int64_int64(gdv_int64 context, const char* input,
   return ret;
 }
 
+// Extracts up to `substring_length` glyphs from the UTF-8 string `input`.
+//
+// `position` is 1-based when positive, counts back from the last glyph when
+// negative, and 0 is treated as the first glyph. A non-positive length, a
+// null/empty input or a start outside the input produce an empty string.
+// On success the result is copied into memory obtained from
+// gdv_fn_context_arena_malloc and its byte length is stored in `out_data_len`.
+FORCE_INLINE
+const char* substr_utf8_int32_int32(gdv_int64 context, const char* input,
+                                    gdv_int32 in_data_len, gdv_int32 position,
+                                    gdv_int32 substring_length, gdv_int32* out_data_len) {
+  if (substring_length <= 0 || input == nullptr || in_data_len <= 0) {
+    *out_data_len = 0;
+    return "";
+  }
+
+  gdv_int32 in_glyphs_count = utf8_length(context, input, in_data_len);
+
+  // in_glyphs_count is zero if input has invalid glyphs
+  if (in_glyphs_count == 0) {
+    *out_data_len = 0;
+    return "";
+  }
+
+  gdv_int32 from_glyph;  // from_glyph==0 indicates the first glyph of the input
+  if (position > 0) {
+    from_glyph = position - 1;
+  } else if (position < 0) {
+    from_glyph = in_glyphs_count + position;
+  } else {
+    from_glyph = 0;
+  }
+
+  if (from_glyph < 0 || from_glyph >= in_glyphs_count) {
+    *out_data_len = 0;
+    return "";
+  }
+
+  // Clamp the requested glyph count to what is available after from_glyph.
+  gdv_int32 out_glyphs_count = substring_length;
+  if (substring_length > in_glyphs_count - from_glyph) {
+    out_glyphs_count = in_glyphs_count - from_glyph;
+  }
+
+  gdv_int32 start_pos = 0;
+  gdv_int32 end_pos = in_data_len;
+
+  // Translate the glyph range [from_glyph, from_glyph + out_glyphs_count) to bytes.
+  gdv_int32 current_glyph = 0;
+  gdv_int32 pos = 0;
+  while (pos < in_data_len) {
+    if (current_glyph == from_glyph) {
+      start_pos = pos;
+    }
+    pos += utf8_char_length(input[pos]);
+    if (current_glyph - from_glyph + 1 == out_glyphs_count) {
+      end_pos = pos;
+      break;  // both offsets are known, nothing left to scan
+    }
+    current_glyph++;
+  }
+
+  // Never copy past the end of the input buffer.
+  if (end_pos > in_data_len) {
+    end_pos = in_data_len;
+  }
+
+  *out_data_len = end_pos - start_pos;
+  char* ret =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_data_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_data_len = 0;
+    return "";
+  }
+  memcpy(ret, input + start_pos, *out_data_len);
+  return ret;
+}
+
 FORCE_INLINE
 const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in_len,
                               gdv_int64 offset64, gdv_int32* out_len) {
   return substr_utf8_int64_int64(context, input, in_len, offset64, in_len, out_len);
 }
 
+// Two-argument form of substr: returns everything from `offset64` to the end of
+// `input` by passing the input's byte length as the glyph count.
+FORCE_INLINE
+const char* substr_utf8_int32(gdv_int64 context, const char* input, gdv_int32 in_len,
+                              gdv_int32 offset64, gdv_int32* out_len) {
+  return substr_utf8_int32_int32(context, input, in_len, offset64, in_len, out_len);
+}
+
 FORCE_INLINE
 const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len,
                               gdv_int32 repeat_number, gdv_int32* out_len) {
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index b84c51b3a6b..baca476637a 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -791,6 +791,76 @@ TEST(TestStringOps, TestSubstring) {
   EXPECT_FALSE(ctx.has_error());
 }
 
+TEST(TestStringOps, TestSubstringInt32) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const char* out_str = substr_utf8_int32_int32(ctx_ptr, "asdf", 4, 1, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");  // zero length
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "asdf", 4, 1, 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "as");  // positions are 1-based
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "asdf", 4, 1, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "asdf");  // length clamped to the input
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "asdf", 4, 0, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "asdf");  // position 0 == first glyph
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "asdf", 4, -2, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "df");  // negative counts from the end
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "asdf", 4, -5, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");  // start before the first glyph
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "अपाचे एरो", 25, 1, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "अपाचे");  // multi-byte glyphs
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "अपाचे एरो", 25, 7, 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "एरो");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "çåå†", 9, 4, 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "†");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "çåå†", 9, 2, 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "åå");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "çåå†", 9, 0, 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "çå");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "afg", 3, 0, -5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");  // negative length
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32_int32(ctx_ptr, "", 0, 5, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");  // empty input
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32(ctx_ptr, "abcd", 4, 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "bcd");  // no length: through the end
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32(ctx_ptr, "abcd", 4, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcd");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int32(ctx_ptr, "çåå†", 9, 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "åå†");
+  EXPECT_FALSE(ctx.has_error());
+}
+
 TEST(TestStringOps, TestSubstringInvalidInputs) {
   gandiva::ExecutionContext ctx;
   uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 83bbdee2085..72088482592 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -469,6 +469,11 @@ const char* substr_utf8_int64_int64(gdv_int64 context, const char* input,
                                     gdv_int64 length, gdv_int32* out_len);
 const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in_len,
                               gdv_int64 offset64, gdv_int32* out_len);
+const char* substr_utf8_int32_int32(gdv_int64 context, const char* input,
+                                    gdv_int32 in_len, gdv_int32 offset64,
+                                    gdv_int32 length, gdv_int32* out_len);
+const char* substr_utf8_int32(gdv_int64 context, const char* input, gdv_int32 in_len,
+                              gdv_int32 offset64, gdv_int32* out_len);
 const char* concat_utf8_utf8(gdv_int64 context, const char* left, gdv_int32 left_len,
                              bool left_validity, const char* right, gdv_int32 right_len,