diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 2eeac71c727..88c91a18818 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -64,6 +64,26 @@ struct BinaryLength {
   }
 };
 
+// Computes the length of a string value by counting every byte that is not a UTF-8
+// continuation byte (a byte of the form 0b10xxxxxx).
+//
+// For well-formed UTF-8 this is the number of code points, because each code point
+// contributes exactly one non-continuation byte.  The input is not validated.
+struct Utf8Length {
+  template <typename OutValue, typename Arg0Value = util::string_view>
+  static OutValue Call(KernelContext*, Arg0Value val) {
+    const auto* data = reinterpret_cast<const uint8_t*>(val.data());
+    const auto* const end = data + val.size();
+
+    OutValue length = 0;
+    for (; data != end; ++data) {
+      // (byte & 0xc0) == 0x80 marks a continuation byte, which is not counted.
+      length += ((*data & 0xc0) != 0x80);
+    }
+    return length;
+  }
+};
+
 #ifdef ARROW_WITH_UTF8PROC
 
 // Direct lookup tables for unicode properties
@@ -1569,9 +1589,14 @@ const FunctionDoc strptime_doc(
 
 const FunctionDoc binary_length_doc(
     "Compute string lengths",
-    ("For each string in `strings`, emit its length. Null values emit null."),
+    ("For each string in `strings`, emit the number of bytes. Null values emit null."),
     {"strings"});
 
+const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
+                                  ("For each string in `strings`, emit the number of "
+                                   "UTF8 characters. Null values emit null."),
+                                  {"strings"});
+
 void AddStrptime(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
   DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
@@ -1597,6 +1622,23 @@ void AddBinaryLength(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Registers the "utf8_length" scalar function, with an int32-producing kernel for
+// utf8 input and an int64-producing kernel for large_utf8 input.
+void AddUtf8Length(FunctionRegistry* registry) {
+  auto func =
+      std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
+
+  ArrayKernelExec exec_offset_32 =
+      applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
+  DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
+
+  ArrayKernelExec exec_offset_64 =
+      applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
+  DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
+
+  DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
 template