diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index dce420b32b2..aaccd08f48a 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -124,6 +124,14 @@ struct ARROW_EXPORT TrimOptions : public FunctionOptions { std::string characters; }; +struct ARROW_EXPORT SliceOptions : public FunctionOptions { + explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits::max(), + int64_t step = 1) + : start(start), stop(stop), step(step) {} + + int64_t start, stop, step; +}; + enum CompareOperator : int8_t { EQUAL, NOT_EQUAL, diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index d939d1c7722..1d87bd86c67 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -138,7 +138,10 @@ struct StringTransform { using offset_type = typename Type::offset_type; using ArrayType = typename TypeTraits::ArrayType; - static int64_t MaxCodeunits(offset_type input_ncodeunits) { return input_ncodeunits; } + virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) { + return input_ncodeunits; + } + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { return Derived().Execute(ctx, batch, out); } @@ -156,7 +159,8 @@ struct StringTransform { offset_type input_ncodeunits = input_boxed.total_values_length(); offset_type input_nstrings = static_cast(input.length); - int64_t output_ncodeunits_max = Derived::MaxCodeunits(input_ncodeunits); + const int64_t output_ncodeunits_max = + MaxCodeunits(input_nstrings, input_ncodeunits); if (output_ncodeunits_max > std::numeric_limits::max()) { return Status::CapacityError( "Result might not fit in a 32bit utf8 array, convert to large_utf8"); @@ -183,35 +187,36 @@ struct StringTransform { output_ncodeunits += encoded_nbytes; output_string_offsets[i + 1] = output_ncodeunits; } + DCHECK_LE(output_ncodeunits, output_ncodeunits_max); // Trim the codepoint buffer, since we allocated too much - RETURN_NOT_OK(values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true)); + return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true); } else { + DCHECK_EQ(batch[0].kind(), Datum::SCALAR); const auto& input = checked_cast(*batch[0].scalar()); - auto result = checked_pointer_cast(MakeNullScalar(out->type())); - if (input.is_valid) { - result->is_valid = true; - offset_type data_nbytes = static_cast(input.value->size()); + if (!input.is_valid) { + return Status::OK(); + } + auto* result = checked_cast(out->scalar().get()); + result->is_valid = true; + offset_type data_nbytes = static_cast(input.value->size()); - int64_t output_ncodeunits_max = Derived::MaxCodeunits(data_nbytes); - if (output_ncodeunits_max > std::numeric_limits::max()) { - return Status::CapacityError( - "Result might not fit in a 32bit utf8 array, convert to large_utf8"); - } - ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max)); - result->value = value_buffer; - offset_type encoded_nbytes = 0; - if (ARROW_PREDICT_FALSE(!static_cast(*this).Transform( - input.value->data(), data_nbytes, value_buffer->mutable_data(), - &encoded_nbytes))) { - return Derived::InvalidStatus(); - } - RETURN_NOT_OK(value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true)); + int64_t output_ncodeunits_max = MaxCodeunits(1, data_nbytes); + if (output_ncodeunits_max > std::numeric_limits::max()) { + return Status::CapacityError( + "Result might not fit in a 32bit utf8 array, 
convert to large_utf8"); } - out->value = result; + ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max)); + result->value = value_buffer; + offset_type encoded_nbytes = 0; + if (ARROW_PREDICT_FALSE(!static_cast(*this).Transform( + input.value->data(), data_nbytes, value_buffer->mutable_data(), + &encoded_nbytes))) { + return Derived::InvalidStatus(); + } + DCHECK_LE(encoded_nbytes, output_ncodeunits_max); + return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true); } - - return Status::OK(); } }; @@ -234,7 +239,8 @@ struct StringTransformCodepoint : StringTransform { *output_written = static_cast(output - output_start); return true; } - static int64_t MaxCodeunits(offset_type input_ncodeunits) { + + int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override { // Section 5.18 of the Unicode spec claim that the number of codepoints for case // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes // However, since we don't support all casings (SpecialCasing.txt) the growth @@ -243,6 +249,7 @@ struct StringTransformCodepoint : StringTransform { // two code units (even) can grow to 3 code units. return static_cast(input_ncodeunits) * 3 / 2; } + Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) { EnsureLookupTablesFilled(); return Base::Execute(ctx, batch, out); @@ -758,6 +765,209 @@ void AddFindSubstring(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +// Slicing + +template +struct SliceBase : StringTransform { + using Base = StringTransform; + using offset_type = typename Base::offset_type; + using State = OptionsWrapper; + + SliceOptions options; + + explicit SliceBase(SliceOptions options) : options(options) {} + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + SliceOptions options = State::Get(ctx); + if (options.step == 0) { + return Status::Invalid("Slice step cannot be zero"); + } + return Derived(options).Execute(ctx, batch, out); + } +}; + +#define PROPAGATE_FALSE(expr) \ + do { \ + if (ARROW_PREDICT_FALSE(!expr)) { \ + return false; \ + } \ + } while (0) + +bool SliceCodeunitsTransform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output, int64_t* output_written, + const SliceOptions& options) { + const uint8_t* begin = input; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* begin_sliced = begin; + const uint8_t* end_sliced = end; + + if (options.step >= 1) { + if (options.start >= 0) { + // start counting from the left + PROPAGATE_FALSE( + arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, options.start)); + if (options.stop > options.start) { + // continue counting from begin_sliced + int64_t length = options.stop - options.start; + PROPAGATE_FALSE( + arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)); + } else if (options.stop < 0) { + // or from the end (but we will never need to < begin_sliced) + PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse( + begin_sliced, end, &end_sliced, -options.stop)); + } else { + // zero length slice + *output_written = 0; + return true; + } + } else { + // start counting from the right + PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced, + -options.start)); + if (options.stop > 0) { + // continue counting from the left, we cannot start from begin_sliced because we + // don't know how many codepoints are between begin and begin_sliced + PROPAGATE_FALSE( + 
            arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, options.stop));
+        // and therefore we also need this check
+        if (end_sliced <= begin_sliced) {
+          // zero length slice
+          *output_written = 0;
+          return true;
+        }
+      } else if ((options.stop < 0) && (options.stop > options.start)) {
+        // stop is negative, but larger than start, so we count again from the right
+        // in some cases we can optimize this, depending on the shortest path (from end
+        // or begin_sliced), but begin_sliced and options.start can be 'out of sync',
+        // for instance when start=-100 while the string length is only 10.
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
+            begin_sliced, end, &end_sliced, -options.stop));
+      } else {
+        // zero length slice
+        *output_written = 0;
+        return true;
+      }
+    }
+    DCHECK(begin_sliced <= end_sliced);
+    if (options.step == 1) {
+      // fast case, where we can simply finish with a copy
+      std::copy(begin_sliced, end_sliced, output);
+      *output_written = end_sliced - begin_sliced;
+    } else {
+      uint8_t* dest = output;
+      const uint8_t* i = begin_sliced;
+
+      while (i < end_sliced) {
+        uint32_t codepoint = 0;
+        // write a single codepoint
+        PROPAGATE_FALSE(arrow::util::UTF8Decode(&i, &codepoint));
+        dest = arrow::util::UTF8Encode(dest, codepoint);
+        // and skip the remainder
+        int64_t skips = options.step - 1;
+        while ((skips--) && (i < end_sliced)) {
+          PROPAGATE_FALSE(arrow::util::UTF8Decode(&i, &codepoint));
+        }
+      }
+      *output_written = dest - output;
+    }
+    return true;
+  } else {  // step < 0
+    // careful +1/-1 bookkeeping: begin_sliced and end_sliced now act like reverse
+    // iterators.
+
+    if (options.start >= 0) {
+      // +1 because begin_sliced acts as the end of a reverse iterator
+      PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced,
+                                                         options.start + 1));
+      // and make it point at the last codeunit of the previous codepoint
+      begin_sliced--;
+    } else {
+      // -1 because start=-1 means the last codepoint, which is 0 advances from the end
+      PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
+                                                                -options.start - 1));
+      // and make it point at the last codeunit of the previous codepoint
+      begin_sliced--;
+    }
+    // similar to options.start
+    if (options.stop >= 0) {
+      PROPAGATE_FALSE(
+          arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, options.stop + 1));
+      end_sliced--;
+    } else {
+      PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &end_sliced,
+                                                                -options.stop - 1));
+      end_sliced--;
+    }
+
+    uint8_t* dest = output;
+    const uint8_t* i = begin_sliced;
+
+    while (i > end_sliced) {
+      uint32_t codepoint = 0;
+      // write a single codepoint
+      PROPAGATE_FALSE(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+      dest = arrow::util::UTF8Encode(dest, codepoint);
+      // and skip the remainder
+      int64_t skips = -options.step - 1;
+      while ((skips--) && (i > end_sliced)) {
+        PROPAGATE_FALSE(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+      }
+    }
+    *output_written = dest - output;
+    return true;
+  }
+}
+
+#undef PROPAGATE_FALSE
+
+template <typename Type>
+struct SliceCodeunits : SliceBase<Type, SliceCodeunits<Type>> {
+  using Base = SliceBase<Type, SliceCodeunits<Type>>;
+  using offset_type = typename Base::offset_type;
+  using Base::Base;
+
+  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+    const SliceOptions& opt = this->options;
+    if ((opt.start >= 0) != (opt.stop >= 0)) {
+      // If start and stop don't have the same sign, we can't guess an upper bound
+      // on the resulting slice lengths, so return a worst case estimate.
+      return input_ncodeunits;
+    }
+    int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
+    // The maximum UTF8 byte size of a codepoint is 4
+    return std::min(input_ncodeunits,
+                    4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
+  }
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    int64_t output_written_64;
+    bool res = SliceCodeunitsTransform(input, input_string_ncodeunits, output,
+                                       &output_written_64, this->options);
+    *output_written = static_cast<offset_type>(output_written_64);
+    return res;
+  }
+};
+
+const FunctionDoc utf8_slice_codeunits_doc(
+    "Slice string",
+    ("For each string in `strings`, slice into a substring defined by\n"
+     "(`start`, `stop`, `step`) as given by `SliceOptions`, where `start` is\n"
+     "inclusive, `stop` is exclusive, and both are measured in codeunits.\n"
+     "If `step` is negative, the string is advanced in reverse order.\n"
+     "A `step` of zero is considered an error.\n"
+     "Null inputs emit null."),
+    {"strings"}, "SliceOptions");
+
+void AddSlice(FunctionRegistry* registry) {
+  auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
+                                               &utf8_slice_codeunits_doc);
+  using t32 = SliceCodeunits<StringType>;
+  using t64 = SliceCodeunits<LargeStringType>;
+  DCHECK_OK(func->AddKernel({utf8()}, utf8(), t32::Exec, t32::State::Init));
+  DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec, t64::State::Init));
+  DCHECK_OK(registry->AddFunction(std::move(func)));
+}
 
 // IsAlpha/Digit etc
 
 #ifdef ARROW_WITH_UTF8PROC
@@ -2716,7 +2926,6 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
   AddUnaryStringPredicate("utf8_is_upper", registry, &utf8_is_upper_doc);
 #endif
 
-  AddSplit(registry);
   AddBinaryLength(registry);
   AddUtf8Length(registry);
   AddMatchSubstring(registry);
@@ -2730,6 +2939,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
                             MemAllocation::NO_PREALLOCATE);
   AddExtractRegex(registry);
 #endif
+  AddSlice(registry);
+  AddSplit(registry);
   AddStrptime(registry);
 }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 5c230c41cd9..fe069810dbd 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -809,6 +809,118 @@ TYPED_TEST(TestStringKernels, TrimUTF8) {
 }
 #endif
 
+// produce test data with e.g.:
+// repr([k[-3:1] for k in ["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"]]).replace("'", '"')
+
+#ifdef ARROW_WITH_UTF8PROC
+TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
+  SliceOptions options{2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["foo", "fo", null, "foo bar"])",
+                   this->type(), R"(["o", "", null, "o "])", &options);
+  SliceOptions options_2{2, 3};
+  // ensure we slice in codeunits, not graphemes
+  // a\u0308 is ä, which is 1 grapheme (character), but two codepoints
+  // \u0308 in utf8 encoding is \xcc\x88
+  this->CheckUnary("utf8_slice_codeunits", R"(["ää", "bä"])", this->type(),
+                   "[\"a\", \"\xcc\x88\"]", &options_2);
+  SliceOptions options_empty_pos{6, 6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), R"(["", ""])",
+                   &options_empty_pos);
+  SliceOptions options_empty_neg{-6, -6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), R"(["", ""])",
+                   &options_empty_neg);
+  SliceOptions options_empty_neg_to_zero{-6, 0};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), R"(["", ""])",
+                   &options_empty_neg_to_zero);
+
+  // stop is beyond 0, but before start (hence the slice is empty)
+  SliceOptions options_edgecase_1{-3, 1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"([""])",
+                   &options_edgecase_1);
+
+  // a safeguard against a tempting optimization that is actually a tricky case
+  SliceOptions options_edgecase_2{-6, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"(["𝑓öõ"])",
+                   &options_edgecase_2);
+
+  auto input = ArrayFromJSON(this->type(), R"(["𝑓öõḍš"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Attempted to initialize KernelState from null FunctionOptions"),
+      CallFunction("utf8_slice_codeunits", {input}));
+
+  SliceOptions options_invalid{2, 4, 0};
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, testing::HasSubstr("Slice step cannot be zero"),
+      CallFunction("utf8_slice_codeunits", {input}, &options_invalid));
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsPosPos) {
+  SliceOptions options{2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "õ", "õḍ", "õḍ"])", &options);
+  SliceOptions options_step{1, 5, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "", "ö", "ö", "öḍ", "öḍ"])", &options_step);
+  SliceOptions options_step_neg{5, 1, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "õ", "ḍ", "šõ"])", &options_step_neg);
+  options_step_neg.stop = 0;
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg);
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) {
+  SliceOptions options{2, -1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "", "õ", "õḍ"])", &options);
+  SliceOptions options_step{1, -1, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "f", "fö", "föo", "föod", "foodš"])",
+                   this->type(), R"(["", "", "", "ö", "ö", "od"])", &options_step);
+  SliceOptions options_step_neg{3, -4, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "ḍ"])", &options_step_neg);
+  options_step_neg.stop = -5;
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "ḍö"])",
+                   &options_step_neg);
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) {
+  SliceOptions options{-2, -1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "", "𝑓", "ö", "õ", "ḍ"])", &options);
+  SliceOptions options_step{-4, -1, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "", "𝑓", "𝑓", "𝑓õ", "öḍ"])", &options_step);
+  SliceOptions options_step_neg{-1, -3, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ", "ḍ", "š"])", &options_step_neg);
+  options_step_neg.stop = -4;
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "šõ"])",
+                   &options_step_neg);
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) {
+  SliceOptions options{-2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö",
"𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "𝑓", "𝑓ö", "öõ", "õḍ", "ḍ"])", &options); + SliceOptions options_step{-4, 4, 2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "𝑓", "𝑓", "𝑓õ", "𝑓õ", "öḍ"])", &options_step); + SliceOptions options_step_neg{-1, 1, -2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "", "õ", "ḍ", "šõ"])", &options_step_neg); + options_step_neg.stop = 0; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg); +} + +#endif // ARROW_WITH_UTF8PROC + TYPED_TEST(TestStringKernels, TrimWhitespaceAscii) { // \xe2\x80\x88 is punctuation space this->CheckUnary("ascii_trim_whitespace", diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 310d6913403..1426dc904ee 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -492,6 +492,30 @@ static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last, return true; } +static inline bool UTF8AdvanceCodepoints(const uint8_t* first, const uint8_t* last, + const uint8_t** destination, int64_t n) { + return UTF8FindIf( + first, last, + [&](uint32_t codepoint) { + bool done = n == 0; + n--; + return done; + }, + destination); +} + +static inline bool UTF8AdvanceCodepointsReverse(const uint8_t* first, const uint8_t* last, + const uint8_t** destination, int64_t n) { + return UTF8FindIfReverse( + first, last, + [&](uint32_t codepoint) { + bool done = n == 0; + n--; + return done; + }, + destination); +} + template static inline bool UTF8ForEach(const uint8_t* first, const uint8_t* last, UnaryFunction&& f) { diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 3cf244ca5e8..7b8205bed8c 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -623,14 +623,14 @@ when a positive ``max_splits`` is given. as separator. -String extraction -~~~~~~~~~~~~~~~~~ +String component extraction +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+--------------------+------------+------------------------------------+---------------+----------------------------------------+ -| Function name | Arity | Input types | Output type | Options class | -+====================+============+====================================+===============+========================================+ -| extract_regex | Unary | String-like | Struct (1) | :struct:`ExtractRegexOptions` | -+--------------------+------------+------------------------------------+---------------+----------------------------------------+ ++--------------------+------------+----------------+---------------+----------------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++====================+============+================+===============+========================================+ +| extract_regex | Unary | String-like | Struct (1) | :struct:`ExtractRegexOptions` | ++--------------------+------------+----------------+---------------+----------------------------------------+ * \(1) Extract substrings defined by a regular expression using the Google RE2 library. The output struct field names refer to the named capture groups, @@ -638,6 +638,26 @@ String extraction ``(?P[ab])(?P\\d)``. 
+Slicing
+~~~~~~~
+
+These functions transform each element of the array into a subsequence, based
+on a start and stop index and a non-zero step (defaulting to 1). Slicing
+follows Python semantics: the start index is inclusive, the stop index
+exclusive, and if the step is negative the sequence is traversed in reverse
+order.
+
++--------------------------+------------+----------------+-----------------+--------------------------+---------+
+| Function name            | Arity      | Input types    | Output type     | Options class            | Notes   |
++==========================+============+================+=================+==========================+=========+
+| utf8_slice_codeunits     | Unary      | String-like    | String-like     | :struct:`SliceOptions`   | \(1)    |
++--------------------------+------------+----------------+-----------------+--------------------------+---------+
+
+* \(1) Slice each string into a substring defined by (``start``, ``stop``, ``step``)
+  as given by :struct:`SliceOptions`, where ``start`` and ``stop`` are measured
+  in codeunits. Null inputs emit null.
+
+
 Structural transforms
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index 1b62226b2b3..2c8f77e12f9 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -721,6 +721,23 @@ class ExtractRegexOptions(_ExtractRegexOptions):
         self._set_options(pattern)
 
 
+cdef class _SliceOptions(FunctionOptions):
+    cdef:
+        unique_ptr[CSliceOptions] slice_options
+
+    cdef const CFunctionOptions* get_options(self) except NULL:
+        return self.slice_options.get()
+
+    def _set_options(self, start, stop, step):
+        self.slice_options.reset(
+            new CSliceOptions(start, stop, step))
+
+
+class SliceOptions(_SliceOptions):
+    def __init__(self, start, stop, step=1):
+        self._set_options(start, stop, step)
+
+
 cdef class _FilterOptions(FunctionOptions):
     cdef:
         unique_ptr[CFilterOptions] filter_options
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index c447aa95c5c..18c730fc11e 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -37,15 +37,16 @@
     IndexOptions,
     MatchSubstringOptions,
     ModeOptions,
-    ScalarAggregateOptions,
-    SplitOptions,
-    SplitPatternOptions,
     PartitionNthOptions,
     ProjectOptions,
     QuantileOptions,
     ReplaceSubstringOptions,
+    ScalarAggregateOptions,
     SetLookupOptions,
+    SliceOptions,
     SortOptions,
+    SplitOptions,
+    SplitPatternOptions,
     StrptimeOptions,
     TakeOptions,
     TDigestOptions,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 9184bd5bbfd..af00618799e 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1792,6 +1792,13 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
         CTrimOptions(c_string characters)
         c_string characters
 
+    cdef cppclass CSliceOptions \
+            "arrow::compute::SliceOptions"(CFunctionOptions):
+        CSliceOptions(int64_t start, int64_t stop, int64_t step)
+        int64_t start
+        int64_t stop
+        int64_t step
+
     cdef cppclass CSplitOptions \
             "arrow::compute::SplitOptions"(CFunctionOptions):
         CSplitOptions(int64_t max_splits, c_bool reverse)
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index a9a2c0f347d..c013c0d3fe9 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -365,6 +365,18 @@ def test_trim():
     assert expected.equals(result)
 
 
+def test_slice_compatibility():
+    arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])
+    for start in range(-6, 6):
+        for stop in range(-6, 6):
+            for step in [-3, -2, -1, 1, 2, 3]:
+                expected = pa.array([k.as_py()[start:stop:step]
+                                     for k in arr])
+                result = pc.utf8_slice_codeunits(
+                    arr, start=start, stop=stop, step=step)
+                assert expected.equals(result)
+
+
 def test_split_pattern():
     arr = pa.array(["-foo---bar--", "---foo---b"])
     result = pc.split_pattern(arr, pattern="---")
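
For reference, a minimal usage sketch of the new kernel from Python, mirroring what `test_slice_compatibility` above exercises (not part of the patch; it assumes a pyarrow build that includes this change, and the results noted in the comments simply follow Python's `s[start:stop:step]` semantics):

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array(["foo", "fo", None, "foo bar", "𝑓öõḍš"])

    # The keyword arguments are packed into SliceOptions(start, stop, step);
    # start is inclusive, stop is exclusive, and null inputs stay null.
    pc.utf8_slice_codeunits(arr, start=2, stop=4)
    # -> ["o", "", null, "o ", "õḍ"]

    # Negative indices count from the end, as in Python.
    pc.utf8_slice_codeunits(arr, start=-3, stop=-1)
    # -> ["fo", "f", null, "ba", "õḍ"]

    # A step other than 1 keeps every step-th codepoint of the slice;
    # a step of zero is rejected with "Slice step cannot be zero".
    pc.utf8_slice_codeunits(arr, start=0, stop=5, step=2)
    # -> ["fo", "f", null, "fob", "𝑓õš"]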