From ca8d8deb96d4fa44646f8b44bc99650b5946a48a Mon Sep 17 00:00:00 2001 From: Paul Guyot Date: Fri, 11 Aug 2023 07:41:32 +0200 Subject: [PATCH] Add missing unicode module Implement `unicode:characters_to_list/1,2` and `unicode:characters_to_binary/1,2,3` using new `interop_iolist_fold`. This fixes `io_lib:format/2` with `t` modifier. Rename `interop_iolist_fold` to `interop_chardata_fold` because it really processes `iodata` and now `chardata` as it works for unicode. Signed-off-by: Paul Guyot --- .clang-format-ignore | 1 - CHANGELOG.md | 4 +- libs/estdlib/src/CMakeLists.txt | 1 + libs/estdlib/src/unicode.erl | 129 ++++++++++++ src/libAtomVM/bitstring.c | 42 ++-- src/libAtomVM/bitstring.h | 19 +- src/libAtomVM/defaultatoms.c | 4 + src/libAtomVM/defaultatoms.h | 6 +- src/libAtomVM/interop.c | 307 +++++++++++++++++++++++++++- src/libAtomVM/interop.h | 24 ++- src/libAtomVM/nifs.c | 124 +++++++++++ src/libAtomVM/nifs.gperf | 5 + src/libAtomVM/opcodesswitch.h | 4 +- tests/erlang_tests/CMakeLists.txt | 2 + tests/erlang_tests/test_unicode.erl | 147 +++++++++++++ tests/libs/estdlib/test_io_lib.erl | 4 +- tests/test.c | 1 + 17 files changed, 789 insertions(+), 35 deletions(-) create mode 100644 libs/estdlib/src/unicode.erl create mode 100644 tests/erlang_tests/test_unicode.erl diff --git a/.clang-format-ignore b/.clang-format-ignore index 5c07b9fbd..61dbe32ae 100644 --- a/.clang-format-ignore +++ b/.clang-format-ignore @@ -7,7 +7,6 @@ # We have a number of existing files that are quite "re-format unfriendly" # Let's ignore all of them src/libAtomVM/bif.c -src/libAtomVM/bitstring.c src/libAtomVM/bitstring.h src/libAtomVM/debug.h src/libAtomVM/defaultatoms.c diff --git a/CHANGELOG.md b/CHANGELOG.md index 3151349c8..56a6d445f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support for Erlang `gpio:close/1` and Elixir `GPIO.close/1` for ESP32 - Added support for the Erlang 
`gen_event` module - Added `start_link` support for the `network` module -- Added support for `erlang:monotomic_time/1` +- Added support for `erlang:monotonic_time/1` - Added `start_link` support for the `gen_statem` module - Added support for serializing floats in erlang external term encoding - Added support for the `SMALL_BIG_EXT` erlang external term encoding @@ -56,6 +56,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `esp:partition_list/0` function - Added `esp:nvs_fetch_binary/2` and `nvs_put_binary/3` functions (`esp:nvs_set_binary` and functions that default to `?ATOMVM_NVS_NS` are deprecated now). +- Added most format possibilities to `io:format/2` and `io_lib:format/2` +- Added `unicode` module with `characters_to_list/1,2` and `characters_to_binary/1,2,3` functions ### Fixed - Fixed issue with formatting integers with io:format() on STM32 platform diff --git a/libs/estdlib/src/CMakeLists.txt b/libs/estdlib/src/CMakeLists.txt index 57f969eb4..211f2e3cc 100644 --- a/libs/estdlib/src/CMakeLists.txt +++ b/libs/estdlib/src/CMakeLists.txt @@ -42,6 +42,7 @@ set(ERLANG_MODULES proplists string timer + unicode erlang ) diff --git a/libs/estdlib/src/unicode.erl b/libs/estdlib/src/unicode.erl new file mode 100644 index 000000000..4ca2d939e --- /dev/null +++ b/libs/estdlib/src/unicode.erl @@ -0,0 +1,129 @@ +% +% This file is part of AtomVM. +% +% Copyright 2023 Paul Guyot +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +% See the License for the specific language governing permissions and +% limitations under the License. 
+% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% + +%%----------------------------------------------------------------------------- +%% @doc An implementation of the Erlang/OTP unicode interface. +%% +%% This module implements a strict subset of the Erlang/OTP unicode +%% interface. +%% @end +%%----------------------------------------------------------------------------- +-module(unicode). + +-export([ + characters_to_list/1, + characters_to_list/2, + characters_to_binary/1, + characters_to_binary/2, + characters_to_binary/3 +]). + +%% A UTF-8 encoded binary. +-type unicode_binary() :: binary(). + +%% Latin-1 encoded data +-type latin1_chardata() :: iodata(). + +%% Unicode or UTF-8 encoded data +-type chardata() :: charlist() | unicode_binary(). +-type charlist() :: maybe_improper_list( + char() | unicode_binary() | charlist(), unicode_binary() | [] +). + +-type encoding() :: utf8 | latin1. + +-export_type([ + unicode_binary/0, + latin1_chardata/0, + chardata/0, + charlist/0, + encoding/0 +]). + +%% @doc Convert UTF-8 data to a list of Unicode characters. +%%

<p>If conversion fails, the function returns a tuple with three elements:</p>
+%% <ul>
+%% <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
+%% <li>Second element is what has been converted so far.</li>
+%% <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ with what Erlang/OTP returns.</li>
+%% </ul>
+%% @param Data data to convert to Unicode +%% @return a list of characters or a tuple if conversion failed. +-spec characters_to_list(Data :: chardata() | latin1_chardata()) -> + list() | {error, list(), chardata() | latin1_chardata()} | {incomplete, list(), binary()}. +characters_to_list(_Data) -> + erlang:nif_error(undefined). + +%% @doc Convert UTF-8 or Latin1 data to a list of Unicode characters. +%% @see characters_to_list/1 +%% @param Data data to convert +%% @param Encoding encoding of data to convert +%% @return a list of characters or a tuple if conversion failed. +-spec characters_to_list(Data :: chardata() | latin1_chardata(), Encoding :: encoding()) -> + list() + | {error, list(), chardata() | latin1_chardata()} + | {incomplete, list(), chardata() | latin1_chardata()}. +characters_to_list(_Data, _Encoding) -> + erlang:nif_error(undefined). + +%% @doc Convert character data to an UTF8 binary +%% @equiv characters_to_binary(Data, utf8, utf8) +%% @param Data data to convert to UTF8 +%% @return an utf8 binary or a tuple if conversion failed. +-spec characters_to_binary(Data :: chardata() | latin1_chardata()) -> + unicode_binary() + | {error, list(), chardata() | latin1_chardata()} + | {incomplete, unicode_binary(), chardata() | latin1_chardata()}. +characters_to_binary(_Data) -> + erlang:nif_error(undefined). + +%% @doc Convert character data in a given encoding to an UTF8 binary +%% @equiv characters_to_binary(Data, InEncoding, utf8) +%% @param Data data to convert to UTF8 +%% @param InEncoding encoding of data +%% @return an utf8 binary or a tuple if conversion failed. +-spec characters_to_binary(Data :: chardata() | latin1_chardata(), InEncoding :: encoding()) -> + unicode_binary() + | {error, list(), chardata() | latin1_chardata()} + | {incomplete, unicode_binary(), chardata() | latin1_chardata()}. +characters_to_binary(_Data, _InEncoding) -> + erlang:nif_error(undefined). 
+ +%% @doc Convert character data in a given encoding to a binary in a given encoding. +%%

<p>If conversion fails, the function returns a tuple with three elements:</p>
+%% <ul>
+%% <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
+%% <li>Second element is what has been converted so far.</li>
+%% <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ with what Erlang/OTP returns.</li>
+%% </ul>
+%% <p>Also, Erlang/OTP's implementation may error with <code>badarg</code> for parameters
+%% for which this function merely returns an error tuple.</p>

+%% @param Data data to convert to UTF8 +%% @param InEncoding encoding of input data +%% @param InEncoding output encoding +%% @return an encoded binary or a tuple if conversion failed. +-spec characters_to_binary( + Data :: chardata() | latin1_chardata(), InEncoding :: encoding(), OutEncoding :: encoding() +) -> + unicode_binary() + | {error, list(), chardata() | latin1_chardata()} + | {incomplete, unicode_binary(), chardata() | latin1_chardata()}. +characters_to_binary(_Data, _InEncoding, _OutEncoding) -> + erlang:nif_error(undefined). diff --git a/src/libAtomVM/bitstring.c b/src/libAtomVM/bitstring.c index b5beb4f3d..307862848 100644 --- a/src/libAtomVM/bitstring.c +++ b/src/libAtomVM/bitstring.c @@ -23,9 +23,7 @@ static inline uint64_t from_le64(uint64_t value) { - return ((((value) & 0xFF) << 56) | (((value) & 0xFF00) << 40) | (((value) & 0xFF0000) << 24) | \ - (((value) & 0xFF000000) << 8) | (((value) & 0xFF00000000) >> 8) | (((value) & 0xFF0000000000) >> 24) | \ - (((value) & 0xFF000000000000) >> 40) | (((value) & 0xFF00000000000000) >> 56)); + return ((((value) &0xFF) << 56) | (((value) &0xFF00) << 40) | (((value) &0xFF0000) << 24) | (((value) &0xFF000000) << 8) | (((value) &0xFF00000000) >> 8) | (((value) &0xFF0000000000) >> 24) | (((value) &0xFF000000000000) >> 40) | (((value) &0xFF00000000000000) >> 56)); } bool bitstring_extract_any_integer(const uint8_t *src, size_t offset, avm_int_t n, @@ -140,12 +138,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size) return true; } -bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size) +enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size) { if (len == 0) { - return false; + return UnicodeTransformDecodeFail; } else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) { - int32_t v = 0; + uint32_t v = 0; v |= (buf[0] & 
0x07) << 18; v |= (buf[1] & 0x3F) << 12; v |= (buf[2] & 0x3F) << 6; @@ -156,9 +154,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o } *c = v; *out_size = 4; - return true; + return UnicodeTransformDecodeSuccess; } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) { - int32_t v = 0; + uint32_t v = 0; v |= (buf[0] & 0x0F) << 12; v |= (buf[1] & 0x3F) << 6; v |= (buf[2] & 0x3F); @@ -168,9 +166,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o } *c = v; *out_size = 3; - return true; + return UnicodeTransformDecodeSuccess; } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) { - int32_t v = 0; + uint32_t v = 0; v |= (buf[0] & 0x1F) << 6; v |= (buf[1] & 0x3F); // overlong encoding @@ -179,16 +177,28 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o } *c = v; *out_size = 2; - return true; + return UnicodeTransformDecodeSuccess; } else if ((*buf & 0x80) == 0) { - int32_t v = 0; + uint32_t v = 0; v |= (buf[0] & 0x7F); *c = v; *out_size = 1; - return true; + return UnicodeTransformDecodeSuccess; + } else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) { + return UnicodeTransformDecodeIncomplete; + } else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) { + return UnicodeTransformDecodeIncomplete; + } else if (len == 1 && (buf[0] & 0xF8) == 0xF0) { + return UnicodeTransformDecodeIncomplete; + } else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) { + return UnicodeTransformDecodeIncomplete; + } else if (len == 1 && (buf[0] & 0xF0) == 0xE0) { + return UnicodeTransformDecodeIncomplete; + } else if (len == 1 && (buf[0] & 0xE0) == 0xC0) { + return UnicodeTransformDecodeIncomplete; } - return false; + return UnicodeTransformDecodeFail; } // UTF-16 encoding, when U in U+010000 to U+10FFFF: @@ -321,7 +331,7 @@ 
bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit v |= (buf[3] & 0xFF) << 24; v |= (buf[2] & 0xFF) << 16; v |= (buf[1] & 0xFF) << 8; - v |= buf[0] & 0xFF; + v |= buf[0] & 0xFF; if (is_invalid_codepoint(v)) { return false; } @@ -332,7 +342,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit v |= (buf[0] & 0xFF) << 24; v |= (buf[1] & 0xFF) << 16; v |= (buf[2] & 0xFF) << 8; - v |= buf[3] & 0xFF; + v |= buf[3] & 0xFF; if (is_invalid_codepoint(v)) { return false; } diff --git a/src/libAtomVM/bitstring.h b/src/libAtomVM/bitstring.h index 60bb7baeb..385c4e343 100644 --- a/src/libAtomVM/bitstring.h +++ b/src/libAtomVM/bitstring.h @@ -99,6 +99,13 @@ enum BitstringFlags #endif }; +enum UnicodeTransformDecodeResult +{ + UnicodeTransformDecodeSuccess, + UnicodeTransformDecodeFail, + UnicodeTransformDecodeIncomplete +}; + union maybe_unsigned_int8 { uint8_t u; @@ -320,10 +327,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size); * @param len the length (in bytes) of the bytes in buf * @param c int value to decode to or NULL to only compute the size. * @param out_size the size in bytes, on output (if not NULL) - * @return \c true if decoding was successful, \c false if character starting at buf is not a valid - * unicode character + * @return \c UnicodeTransformDecodeSuccess if decoding was successful, + * \c UnicodeTransformDecodeFail if character starting at buf is not a valid + * unicode character or \c UnicodeTransformDecodeIncomplete if character + * starting at buf is a valid but incomplete transformation */ -bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size); +enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size); /** * @brief Encode a character to UTF-16. 
@@ -428,11 +437,11 @@ static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t * @return \c true if encoding was successful, \c false if src_bin at offset is not a valid * unicode character */ -static inline bool bitstring_match_utf8(term src_bin, size_t offset, int32_t *c, size_t *out_size) +static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c, size_t *out_size) { size_t byte_offset = offset >> 3; // divide by 8 const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset; - return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size); + return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess; } /** diff --git a/src/libAtomVM/defaultatoms.c b/src/libAtomVM/defaultatoms.c index d07df7a0a..5981644b8 100644 --- a/src/libAtomVM/defaultatoms.c +++ b/src/libAtomVM/defaultatoms.c @@ -143,6 +143,8 @@ static const char *const attributes_atom = "\xA" "attributes"; static const char *const compile_atom = "\x7" "compile"; static const char *const exports_atom = "\x7" "exports"; +static const char *const incomplete_atom = "\xA" "incomplete"; + void defaultatoms_init(GlobalContext *glb) { int ok = 1; @@ -270,6 +272,8 @@ void defaultatoms_init(GlobalContext *glb) ok &= globalcontext_insert_atom(glb, compile_atom) == COMPILE_ATOM_INDEX; ok &= globalcontext_insert_atom(glb, exports_atom) == EXPORTS_ATOM_INDEX; + ok &= globalcontext_insert_atom(glb, incomplete_atom) == INCOMPLETE_ATOM_INDEX; + if (!ok) { AVM_ABORT(); } diff --git a/src/libAtomVM/defaultatoms.h b/src/libAtomVM/defaultatoms.h index a30f81624..1bf6a1198 100644 --- a/src/libAtomVM/defaultatoms.h +++ b/src/libAtomVM/defaultatoms.h @@ -152,7 +152,9 @@ extern "C" { #define COMPILE_ATOM_INDEX 97 #define EXPORTS_ATOM_INDEX 98 -#define PLATFORM_ATOMS_BASE_INDEX 99 +#define INCOMPLETE_ATOM_INDEX 99 + +#define PLATFORM_ATOMS_BASE_INDEX 100 #define FALSE_ATOM 
TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX) #define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX) @@ -279,6 +281,8 @@ extern "C" { #define COMPILE_ATOM TERM_FROM_ATOM_INDEX(COMPILE_ATOM_INDEX) #define EXPORTS_ATOM TERM_FROM_ATOM_INDEX(EXPORTS_ATOM_INDEX) +#define INCOMPLETE_ATOM TERM_FROM_ATOM_INDEX(INCOMPLETE_ATOM_INDEX) + void defaultatoms_init(GlobalContext *glb); void platform_defaultatoms_init(GlobalContext *glb); diff --git a/src/libAtomVM/interop.c b/src/libAtomVM/interop.c index b04b00281..f75261b44 100644 --- a/src/libAtomVM/interop.c +++ b/src/libAtomVM/interop.c @@ -20,9 +20,13 @@ #include "interop.h" +#include "bitstring.h" #include "defaultatoms.h" #include "tempstack.h" +#include "term.h" +#include "term_typedef.h" #include "valueshashtable.h" +#include char *interop_term_to_string(term t, int *ok) { @@ -176,7 +180,7 @@ term interop_proplist_get_value_default(term list, term key, term default_value) return default_value; } -inline InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun fold_fun, void *accum) +inline InteropFunctionResult interop_chardata_fold(term t, interop_chardata_fold_fun fold_fun, interop_chardata_rest_fun rest_fun, void *accum) { if (term_is_binary(t)) { return fold_fun(t, accum); @@ -200,6 +204,15 @@ inline InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun if (term_is_integer(t) || term_is_binary(t)) { InteropFunctionResult result = fold_fun(t, accum); if (UNLIKELY(result != InteropOk)) { + if (rest_fun) { + // we don't pass failed element, fold_fun handles it + t = temp_stack_pop(&temp_stack); + while (!temp_stack_is_empty(&temp_stack)) { + rest_fun(t, accum); + t = temp_stack_pop(&temp_stack); + } + } + // we don't process last element either which is the original list temp_stack_destroy(&temp_stack); return result; } else { @@ -217,6 +230,13 @@ inline InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun t = term_get_list_head(t); } else { + if (rest_fun) { + while 
(!temp_stack_is_empty(&temp_stack)) { + rest_fun(t, accum); + t = temp_stack_pop(&temp_stack); + } + // we don't process last element which was the passed term + } temp_stack_destroy(&temp_stack); return InteropBadArg; } @@ -232,7 +252,7 @@ static inline InteropFunctionResult size_fold_fun(term t, void *accum) size_t *size = (size_t *) accum; if (term_is_integer(t)) { *size += 1; - } else if (term_is_binary(t)) { + } else /* term_is_binary(t) */ { *size += term_binary_size(t); } return InteropOk; @@ -241,7 +261,7 @@ static inline InteropFunctionResult size_fold_fun(term t, void *accum) InteropFunctionResult interop_iolist_size(term t, size_t *size) { *size = 0; - return interop_iolist_fold(t, size_fold_fun, size); + return interop_chardata_fold(t, size_fold_fun, NULL, size); } static inline InteropFunctionResult write_string_fold_fun(term t, void *accum) @@ -250,7 +270,7 @@ static inline InteropFunctionResult write_string_fold_fun(term t, void *accum) if (term_is_integer(t)) { **p = term_to_int(t); (*p)++; - } else if (term_is_binary(t)) { + } else /* term_is_binary(t) */ { int len = term_binary_size(t); memcpy(*p, term_binary_data(t), len); *p += len; @@ -260,7 +280,284 @@ static inline InteropFunctionResult write_string_fold_fun(term t, void *accum) InteropFunctionResult interop_write_iolist(term t, char *p) { - return interop_iolist_fold(t, write_string_fold_fun, (void *) &p); + return interop_chardata_fold(t, write_string_fold_fun, NULL, (void *) &p); +} + +static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *output, size_t *output_len, size_t *rest_crsr, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding) +{ + size_t len = term_binary_size(t); + if (in_encoding == Latin1Encoding && out_encoding == Latin1Encoding) { + if (output) { + memcpy(output, term_binary_data(t), len); + } + *output_len = len; + return UnicodeOk; + } + size_t result = 0; + size_t input_index; + const uint8_t *input = (const uint8_t *) 
term_binary_data(t); + if (in_encoding == Latin1Encoding) { + for (input_index = 0; input_index < len; input_index++) { + if (out_encoding == UTF8Encoding) { + size_t char_size; + if (UNLIKELY(!bitstring_utf8_encode(input[input_index], output, &char_size))) { + *rest_crsr = input_index; + *output_len = result; + return UnicodeError; + } + result += char_size; + if (output) { + output += char_size; + } + } else { + // UCS4Native + result += sizeof(uint32_t); + if (output) { + *((uint32_t *) output) = input[input_index]; + output += sizeof(uint32_t); + } + } + } + *output_len = result; + return UnicodeOk; + } + input_index = 0; + while (input_index < len) { + size_t char_size; + uint32_t c; + enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size); + if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) { + *rest_crsr = input_index; + *output_len = result; + return decode_result == UnicodeTransformDecodeIncomplete ? 
UnicodeIncompleteTransform : UnicodeError; + } + switch (out_encoding) { + case Latin1Encoding: { + if (c > 255) { + *rest_crsr = input_index; + *output_len = result; + return UnicodeError; + } + if (output) { + *output++ = c; + } + result++; + } break; + case UTF8Encoding: { + if (output) { + memcpy(output, input + input_index, char_size); + output += char_size; + } + result += char_size; + } break; + case UCS4NativeEncoding: { + if (output) { + *((uint32_t *) output) = c; + output += sizeof(uint32_t); + } + result += sizeof(uint32_t); + } break; + } + input_index += char_size; + } + *output_len = result; + return UnicodeOk; +} + +struct CharDataToBytesSizeAcc +{ + enum CharDataEncoding in_encoding; + enum CharDataEncoding out_encoding; + size_t size; + size_t rest_size; + bool badarg; + bool incomplete_transform; +}; + +static InteropFunctionResult chardata_to_bytes_size_fold_fun(term t, void *accum) +{ + struct CharDataToBytesSizeAcc *acc = (struct CharDataToBytesSizeAcc *) accum; + if (term_is_binary(t)) { + size_t bin_size; + size_t rest_crsr; + enum UnicodeConversionResult conv_result = interop_binary_conversion(t, NULL, &bin_size, &rest_crsr, acc->in_encoding, acc->out_encoding); + acc->size += bin_size; + if (UNLIKELY(conv_result != UnicodeOk)) { + acc->rest_size = term_sub_binary_heap_size(t, term_binary_size(t) - rest_crsr); + acc->incomplete_transform = conv_result == UnicodeIncompleteTransform; + return InteropBadArg; + } + } else /* term_is_integer(t) */ { + avm_int_t c = term_to_int(t); + if (c < 0) { + return InteropBadArg; + } + switch (acc->out_encoding) { + case Latin1Encoding: { + if (c > 255) { + return InteropBadArg; + } + acc->size++; + } break; + case UTF8Encoding: { + size_t char_size; + if (UNLIKELY(!bitstring_utf8_encode(c, NULL, &char_size))) { + return InteropBadArg; + } + acc->size += char_size; + } break; + case UCS4NativeEncoding: { + acc->size += sizeof(uint32_t); + } break; + } + } + return InteropOk; +} + +static void 
chardata_to_bytes_size_rest_fun(term t, void *accum) +{ + struct CharDataToBytesSizeAcc *acc = (struct CharDataToBytesSizeAcc *) accum; + if (!term_is_binary(t) && !term_is_integer(t) && !term_is_list(t)) { + acc->badarg = true; + } + if (!acc->badarg) { + if (!term_is_nil(t)) { + acc->incomplete_transform = false; + } + acc->rest_size += CONS_SIZE; + } +} + +enum UnicodeConversionResult interop_chardata_to_bytes_size(term t, size_t *size, size_t *rest_size, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding) +{ + struct CharDataToBytesSizeAcc acc = { + .in_encoding = in_encoding, + .out_encoding = out_encoding, + .size = 0, + .rest_size = 0, + .badarg = false, + .incomplete_transform = false + }; + InteropFunctionResult res = interop_chardata_fold(t, chardata_to_bytes_size_fold_fun, chardata_to_bytes_size_rest_fun, &acc); + if (UNLIKELY(res == InteropMemoryAllocFail)) { + return UnicodeMemoryAllocFail; + } + if (acc.badarg) { + return UnicodeBadArg; + } + *size = acc.size; + if (rest_size) { + *rest_size = acc.rest_size; + } + if (acc.incomplete_transform) { + return UnicodeIncompleteTransform; + } + return res == InteropOk ? 
UnicodeOk : UnicodeError; +} + +struct CharDataToBytesAcc +{ + enum CharDataEncoding in_encoding; + enum CharDataEncoding out_encoding; + uint8_t *output; + term *rest; + Heap *heap; + bool badarg; + bool incomplete_transform; +}; + +static InteropFunctionResult chardata_to_bytes_fold_fun(term t, void *accum) +{ + struct CharDataToBytesAcc *acc = (struct CharDataToBytesAcc *) accum; + if (term_is_binary(t)) { + size_t bin_size; + size_t rest_crsr; + enum UnicodeConversionResult conv_result = interop_binary_conversion(t, acc->output, &bin_size, &rest_crsr, acc->in_encoding, acc->out_encoding); + acc->output += bin_size; + if (UNLIKELY(conv_result != UnicodeOk)) { + if (acc->rest) { + *acc->rest = term_alloc_sub_binary(t, rest_crsr, term_binary_size(t) - rest_crsr, acc->heap); + } + if (conv_result == UnicodeIncompleteTransform) { + acc->incomplete_transform = true; + } + return InteropBadArg; + } + } else /* term_is_integer(t) */ { + avm_int_t c = term_to_int(t); + if (c < 0) { + if (acc->rest) { + *acc->rest = t; + } + return InteropBadArg; + } + switch (acc->out_encoding) { + case Latin1Encoding: { + if (c > 255) { + if (acc->rest) { + *acc->rest = t; + } + return InteropBadArg; + } + *acc->output++ = (uint8_t) c; + } break; + case UTF8Encoding: { + size_t char_size; + if (UNLIKELY(!bitstring_utf8_encode(c, acc->output, &char_size))) { + if (acc->rest) { + *acc->rest = t; + } + return InteropBadArg; + } + acc->output += char_size; + } break; + case UCS4NativeEncoding: { + *((uint32_t *) acc->output) = c; + acc->output += sizeof(uint32_t); + } break; + } + } + return InteropOk; +} + +static void chardata_to_bytes_rest_fun(term t, void *accum) +{ + struct CharDataToBytesAcc *acc = (struct CharDataToBytesAcc *) accum; + if (!term_is_binary(t) && !term_is_integer(t) && !term_is_list(t)) { + acc->badarg = true; + } + if (!acc->badarg) { + if (!term_is_nil(t)) { + acc->incomplete_transform = false; + } + if (acc->rest) { + *acc->rest = term_list_prepend(*acc->rest, t, 
acc->heap); + } + } +} + +enum UnicodeConversionResult interop_chardata_to_bytes(term t, uint8_t *output, term *rest, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding, Heap *heap) +{ + struct CharDataToBytesAcc acc = { + .in_encoding = in_encoding, + .out_encoding = out_encoding, + .output = output, + .rest = rest, + .heap = heap, + .badarg = false, + .incomplete_transform = false + }; + InteropFunctionResult res = interop_chardata_fold(t, chardata_to_bytes_fold_fun, chardata_to_bytes_rest_fun, &acc); + if (UNLIKELY(res == InteropMemoryAllocFail)) { + return UnicodeMemoryAllocFail; + } + if (acc.badarg) { + return UnicodeBadArg; + } + if (acc.incomplete_transform) { + return UnicodeIncompleteTransform; + } + return res == InteropOk ? UnicodeOk : UnicodeError; } term interop_map_get_value(GlobalContext *glb, term map, term key) diff --git a/src/libAtomVM/interop.h b/src/libAtomVM/interop.h index c1d92b836..1c05ae5ea 100644 --- a/src/libAtomVM/interop.h +++ b/src/libAtomVM/interop.h @@ -35,6 +35,15 @@ typedef enum InteropBadArg } InteropFunctionResult; +enum UnicodeConversionResult +{ + UnicodeOk = InteropOk, + UnicodeMemoryAllocFail = InteropMemoryAllocFail, + UnicodeBadArg = InteropBadArg, + UnicodeError, + UnicodeIncompleteTransform +}; + /** * An idiomatic macro for marking an AtomStringIntPair table entry as a * interop_atom_term_select_int default. 
@@ -53,7 +62,8 @@ typedef struct int i_val; } AtomStringIntPair; -typedef InteropFunctionResult (*interop_iolist_fold_fun)(term t, void *accum); +typedef InteropFunctionResult (*interop_chardata_fold_fun)(term t, void *accum); /* fold callback: receives a term and the caller's accumulator */ +typedef void (*interop_chardata_rest_fun)(term t, void *accum); /* rest callback: presumably invoked for the terms left over once folding stops early; confirm in interop_chardata_fold */ char *interop_term_to_string(term t, int *ok); char *interop_binary_to_string(term binary); @@ -67,7 +77,17 @@ term interop_map_get_value_default(GlobalContext *glb, term map, term key, term NO_DISCARD InteropFunctionResult interop_iolist_size(term t, size_t *size); NO_DISCARD InteropFunctionResult interop_write_iolist(term t, char *p); -NO_DISCARD InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun fold_fun, void *accum); +NO_DISCARD InteropFunctionResult interop_chardata_fold(term t, interop_chardata_fold_fun fold_fun, interop_chardata_rest_fun rest_fun, void *accum); + +enum CharDataEncoding /* encodings accepted by the interop_chardata_to_bytes functions */ +{ + Latin1Encoding, + UTF8Encoding, + UCS4NativeEncoding // Output only: used by characters_to_list, never as an input encoding +}; + +NO_DISCARD enum UnicodeConversionResult interop_chardata_to_bytes_size(term t, size_t *size, size_t *rest_size, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding); /* sizing pass: reports through size and rest_size, without producing any output */ +NO_DISCARD enum UnicodeConversionResult interop_chardata_to_bytes(term t, uint8_t *output, term *rest, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding, Heap *heap); /* conversion pass: writes into output, which takes no capacity argument */ /** * @brief Finds on a table the first matching atom string.
diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c index bf9083407..caa43ed52 100644 --- a/src/libAtomVM/nifs.c +++ b/src/libAtomVM/nifs.c @@ -164,6 +164,8 @@ static term nif_base64_decode_to_string(Context *ctx, int argc, term argv[]); static term nif_code_load_abs(Context *ctx, int argc, term argv[]); static term nif_code_load_binary(Context *ctx, int argc, term argv[]); static term nif_maps_next(Context *ctx, int argc, term argv[]); +static term nif_unicode_characters_to_list(Context *ctx, int argc, term argv[]); +static term nif_unicode_characters_to_binary(Context *ctx, int argc, term argv[]); #define DECLARE_MATH_NIF_FUN(moniker) \ static term nif_math_##moniker(Context *ctx, int argc, term argv[]); @@ -706,6 +708,16 @@ static const struct Nif maps_next_nif = .base.type = NIFFunctionType, .nif_ptr = nif_maps_next }; +static const struct Nif unicode_characters_to_list_nif = +{ + .base.type = NIFFunctionType, + .nif_ptr = nif_unicode_characters_to_list +}; +static const struct Nif unicode_characters_to_binary_nif = +{ + .base.type = NIFFunctionType, + .nif_ptr = nif_unicode_characters_to_binary +}; #define DEFINE_MATH_NIF(moniker) \ static const struct Nif math_##moniker##_nif = \ @@ -4216,6 +4228,122 @@ static term nif_maps_next(Context *ctx, int argc, term argv[]) return ret; } +/* + * unicode:characters_to_list/1,2: argv[0] is the chardata and the optional + * argv[1] the input encoding (latin1 or utf8, utf8 when absent). Returns the + * list of code points, or {error | incomplete, Converted, Rest} on failure. + */ +static term nif_unicode_characters_to_list(Context *ctx, int argc, term argv[]) +{ + enum CharDataEncoding in_encoding = UTF8Encoding; + if (argc == 2) { + if (argv[1] == LATIN1_ATOM) { + in_encoding = Latin1Encoding; + } else if (UNLIKELY((argv[1] != UTF8_ATOM))) { + RAISE_ERROR(BADARG_ATOM); + } + } + size_t size; + size_t rest_size; + enum UnicodeConversionResult conv_result = interop_chardata_to_bytes_size(argv[0], &size, &rest_size, in_encoding, UCS4NativeEncoding); /* first pass: sizes only */ + if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + if (UNLIKELY(conv_result == UnicodeBadArg)) { + RAISE_ERROR(BADARG_ATOM); + } + size_t len = size /
sizeof(uint32_t); + uint32_t *chars = malloc(size); + // malloc(0) may return NULL without being out of memory: only fail for a non-empty buffer + if (IS_NULL_PTR(chars) && size > 0) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + size_t needed_terms = CONS_SIZE * len; + if (UNLIKELY(conv_result == UnicodeError || conv_result == UnicodeIncompleteTransform)) { + needed_terms += rest_size + TUPLE_SIZE(3); + } + if (UNLIKELY(memory_ensure_free(ctx, needed_terms) != MEMORY_GC_OK)) { + free(chars); + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + term rest; /* only read below when the conversion did not return UnicodeOk */ + conv_result = interop_chardata_to_bytes(argv[0], (uint8_t *) chars, &rest, in_encoding, UCS4NativeEncoding, &ctx->heap); /* second pass: fills chars */ + if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) { + free(chars); + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + term result = term_nil(); + for (size_t index_list = len; index_list > 0; index_list--) { /* built from the last character so that prepending yields the right order */ + result = term_list_prepend(term_from_int(chars[index_list - 1]), result, &ctx->heap); + } + free(chars); + if (LIKELY(conv_result == UnicodeOk)) { + return result; + } + term result_tuple = term_alloc_tuple(3, &ctx->heap); + term_put_tuple_element(result_tuple, 0, conv_result == UnicodeError ?
ERROR_ATOM : INCOMPLETE_ATOM); + term_put_tuple_element(result_tuple, 1, result); + term_put_tuple_element(result_tuple, 2, rest); + return result_tuple; +} + +/* + * unicode:characters_to_binary/1,2,3: argv[0] is the chardata, the optional + * argv[1] and argv[2] the input and output encodings (latin1 or utf8, utf8 + * when absent). Returns the binary, or {error | incomplete, Converted, Rest}. + */ +static term nif_unicode_characters_to_binary(Context *ctx, int argc, term argv[]) +{ + enum CharDataEncoding in_encoding = UTF8Encoding; + enum CharDataEncoding out_encoding = UTF8Encoding; + if (argc > 1) { + if (argv[1] == LATIN1_ATOM) { + in_encoding = Latin1Encoding; + } else if (UNLIKELY((argv[1] != UTF8_ATOM))) { + RAISE_ERROR(BADARG_ATOM); + } + if (argc == 3) { + if (argv[2] == LATIN1_ATOM) { + out_encoding = Latin1Encoding; + } else if (UNLIKELY((argv[2] != UTF8_ATOM))) { + RAISE_ERROR(BADARG_ATOM); + } + } + } + size_t len; + size_t rest_size; + enum UnicodeConversionResult conv_result = interop_chardata_to_bytes_size(argv[0], &len, &rest_size, in_encoding, out_encoding); /* first pass: sizes only */ + if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + if (UNLIKELY(conv_result == UnicodeBadArg)) { + RAISE_ERROR(BADARG_ATOM); + } + // NOTE(review): confirm this covers the full heap need of term_create_uninitialized_binary, header included + size_t needed_terms = term_binary_data_size_in_terms(len); + if (UNLIKELY(conv_result == UnicodeError || conv_result == UnicodeIncompleteTransform)) { + needed_terms += TUPLE_SIZE(3) + rest_size; + } + if (UNLIKELY(memory_ensure_free(ctx, needed_terms) != MEMORY_GC_OK)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + term result = term_create_uninitialized_binary(len, &ctx->heap, ctx->global); + uint8_t *binary_data = (uint8_t *) term_binary_data(result); + term rest; /* only read below when the conversion did not return UnicodeOk */ + conv_result = interop_chardata_to_bytes(argv[0], binary_data, &rest, in_encoding, out_encoding, &ctx->heap); /* second pass: fills the binary in place */ + if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + if (LIKELY(conv_result == UnicodeOk)) { + return result; + } + term result_tuple = term_alloc_tuple(3, &ctx->heap); + term_put_tuple_element(result_tuple, 0, conv_result == UnicodeError ?
ERROR_ATOM : INCOMPLETE_ATOM); + term_put_tuple_element(result_tuple, 1, result); + term_put_tuple_element(result_tuple, 2, rest); + return result_tuple; +} + // // MAINTENANCE NOTE: Exception handling for fp operations using math // error handling is designed to be thread-safe, as errors are specified diff --git a/src/libAtomVM/nifs.gperf b/src/libAtomVM/nifs.gperf index 610af249e..65f7bbaf4 100644 --- a/src/libAtomVM/nifs.gperf +++ b/src/libAtomVM/nifs.gperf @@ -142,6 +142,11 @@ base64:decode/1, &base64_decode_nif base64:encode_to_string/1, &base64_encode_to_string_nif base64:decode_to_string/1, &base64_decode_to_string_nif maps:next/1, &maps_next_nif +unicode:characters_to_list/1, &unicode_characters_to_list_nif +unicode:characters_to_list/2, &unicode_characters_to_list_nif +unicode:characters_to_binary/1, &unicode_characters_to_binary_nif +unicode:characters_to_binary/2, &unicode_characters_to_binary_nif +unicode:characters_to_binary/3, &unicode_characters_to_binary_nif math:acos/1, &math_acos_nif math:acosh/1, &math_acosh_nif math:asin/1, &math_asin_nif diff --git a/src/libAtomVM/opcodesswitch.h b/src/libAtomVM/opcodesswitch.h index e2640d9e6..6707320df 100644 --- a/src/libAtomVM/opcodesswitch.h +++ b/src/libAtomVM/opcodesswitch.h @@ -3954,7 +3954,7 @@ HOT_FUNC int scheduler_entry_point(GlobalContext *glb) term src_bin = term_get_match_state_binary(src); avm_int_t offset_bits = term_get_match_state_offset(src); - int32_t val = 0; + uint32_t val = 0; size_t out_size = 0; bool is_valid = bitstring_match_utf8(src_bin, (size_t) offset_bits, &val, &out_size); @@ -3994,7 +3994,7 @@ HOT_FUNC int scheduler_entry_point(GlobalContext *glb) term src_bin = term_get_match_state_binary(src); avm_int_t offset_bits = term_get_match_state_offset(src); - int32_t c = 0; + uint32_t c = 0; size_t out_size = 0; bool is_valid = bitstring_match_utf8(src_bin, (size_t) offset_bits, &c, &out_size); diff --git a/tests/erlang_tests/CMakeLists.txt b/tests/erlang_tests/CMakeLists.txt index
50dafb53d..54f5bfc93 100644 --- a/tests/erlang_tests/CMakeLists.txt +++ b/tests/erlang_tests/CMakeLists.txt @@ -265,6 +265,7 @@ compile_erlang(test_integer_to_binary) compile_erlang(test_list_to_binary) compile_erlang(test_binary_to_list) compile_erlang(test_atom_to_binary) +compile_erlang(test_unicode) compile_erlang(test_binary_part) compile_erlang(test_binary_split) @@ -702,6 +703,7 @@ add_custom_target(erlang_test_modules DEPENDS test_list_to_binary.beam test_binary_to_list.beam test_atom_to_binary.beam + test_unicode.beam test_binary_part.beam test_binary_split.beam diff --git a/tests/erlang_tests/test_unicode.erl b/tests/erlang_tests/test_unicode.erl new file mode 100644 index 000000000..b7a12dc1f --- /dev/null +++ b/tests/erlang_tests/test_unicode.erl @@ -0,0 +1,147 @@ +% +% This file is part of AtomVM. +% +% Copyright 2023 Paul Guyot +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +% See the License for the specific language governing permissions and +% limitations under the License. +% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% + +-module(test_unicode). + +-export([start/0]). + +start() -> % entry point: runs the four test groups in order; any failed match crashes, otherwise returns 0 + ok = test_to_list_latin1(), + ok = test_to_list_utf8(), + ok = test_to_binary_latin1(), + ok = test_to_binary_utf8(), + 0.
+ +test_to_list_latin1() -> % characters_to_list/2 with latin1 input: plain conversions, error tuples carrying the rest, badarg on a pid + "hello" = unicode:characters_to_list(<<"hello">>, latin1), + "hello" = unicode:characters_to_list("hello", latin1), + "hé" = unicode:characters_to_list(<<"hé">>, latin1), + "hé" = unicode:characters_to_list("hé", latin1), % list input with a non-ASCII character, mirroring the "hello" pair above + {error, "h", [-1]} = unicode:characters_to_list([$h, -1], latin1), + {error, "h", [-1 | "ello"]} = unicode:characters_to_list([$h, -1 | "ello"], latin1), + {error, "fooh", [[-1 | "ello"], "bar"]} = unicode:characters_to_list( + ["foo", [$h, -1 | "ello"], "bar"], latin1 + ), + ok = + try + unicode:characters_to_list([$h, self()], latin1), + fail + catch + error:badarg -> ok + end, + ok. + +test_to_list_utf8() -> % characters_to_list/1,2 with utf8 input: plain conversions, then error and incomplete tuples + "hello" = unicode:characters_to_list(<<"hello">>), + "hello" = unicode:characters_to_list("hello", utf8), + "hé" = unicode:characters_to_list(<<"hé"/utf8>>), + "hé" = unicode:characters_to_list("hé"), + "hé" = unicode:characters_to_list(<<"hé"/utf8>>, utf8), + {error, "h", [-1]} = unicode:characters_to_list([$h, -1], utf8), + {incomplete, "h", <<"é">>} = unicode:characters_to_list(<<"hé">>), % without /utf8 the binary ends with the single byte 233, a truncated UTF-8 sequence + {error, [], <<16#A0, 16#A1>>} = unicode:characters_to_list(<<16#A0, 16#A1>>), + % Erlang/OTP documentation writes: "The last part is mostly for debugging" + % BEAM and ATOM representation differ a little bit + Expected1 = + case erlang:system_info(machine) of + "BEAM" -> [<<"é">>, [[["bar"]]]]; + "ATOM" -> [[<<"é">>, ["bar"]]] + end, + {error, "fooh", Expected1} = unicode:characters_to_list(["foo", [<<"hé">>, ["bar"]]]), + Expected2 = + case erlang:system_info(machine) of + "BEAM" -> [<<"é">>, [[["bar"], "foobar"]]]; + "ATOM" -> [[<<"é">>, ["bar"], "foobar"]] + end, + {error, "fooh", Expected2} = unicode:characters_to_list(["foo", [<<"hé">>, ["bar"], "foobar"]]), + ok.
+ +test_to_binary_latin1() -> % characters_to_binary/3 with latin1 output: plain conversions, error tuples carrying the rest, badarg on a pid + <<"hello">> = unicode:characters_to_binary("hello", latin1, latin1), + <<"hello">> = unicode:characters_to_binary(<<"hello">>, latin1, latin1), + <<"hé">> = unicode:characters_to_binary("hé", latin1, latin1), + <<"hé">> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8, latin1), + % Erlang/OTP raises badarg on -1 with latin1, whereas AtomVM returns an error tuple + ok = + case erlang:system_info(machine) of + "BEAM" -> + try + unicode:characters_to_binary([$h, -1], latin1, latin1) + catch + error:badarg -> ok + end; + "ATOM" -> + {error, <<"h">>, [-1]} = unicode:characters_to_binary([$h, -1], latin1, latin1), + ok + end, + Expected1 = % 2000 does not fit in latin1; the rest term is nested one level deeper on BEAM + case erlang:system_info(machine) of + "BEAM" -> [[2000]]; + "ATOM" -> [2000] + end, + {error, <<"h">>, Expected1} = unicode:characters_to_binary([$h, 2000], latin1, latin1), + Expected2 = + case erlang:system_info(machine) of + "BEAM" -> [[2000] | "ello"]; + "ATOM" -> [2000 | "ello"] + end, + {error, <<"h">>, Expected2} = unicode:characters_to_binary([$h, 2000 | "ello"], latin1, latin1), + Expected3 = + case erlang:system_info(machine) of + "BEAM" -> [[[2000] | "ello"], "bar"]; + "ATOM" -> [[2000 | "ello"], "bar"] + end, + {error, <<"fooh">>, Expected3} = unicode:characters_to_binary( + ["foo", [$h, 2000 | "ello"], "bar"], latin1, latin1 + ), + ok = + try + unicode:characters_to_binary([$h, self()], latin1, latin1), + fail + catch + error:badarg -> ok + end, + ok.
+ +test_to_binary_utf8() -> % characters_to_binary/1,2,3 with utf8 output: plain conversions, then error and incomplete tuples + <<"hello">> = unicode:characters_to_binary("hello", utf8, utf8), + <<"hello">> = unicode:characters_to_binary(<<"hello">>, utf8, utf8), + <<"hé"/utf8>> = unicode:characters_to_binary("hé", latin1, utf8), + <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8, utf8), + <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8), + <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>), + {error, <<"h">>, [-1]} = unicode:characters_to_binary([$h, -1]), + {incomplete, <<"h">>, <<"é">>} = unicode:characters_to_binary(<<"hé">>), % without /utf8 the binary ends with the single byte 233, a truncated UTF-8 sequence + {error, <<>>, <<16#A0, 16#A1>>} = unicode:characters_to_binary(<<16#A0, 16#A1>>), + Expected1 = % the shape of the rest term differs between BEAM and AtomVM + case erlang:system_info(machine) of + "BEAM" -> [<<"é">>, [[["bar"]]]]; + "ATOM" -> [[<<"é">>, ["bar"]]] + end, + {error, <<"fooh">>, Expected1} = unicode:characters_to_binary(["foo", [<<"hé">>, ["bar"]]]), + Expected2 = + case erlang:system_info(machine) of + "BEAM" -> [<<"é">>, [[["bar"], "foobar"]]]; + "ATOM" -> [[<<"é">>, ["bar"], "foobar"]] + end, + {error, <<"fooh">>, Expected2} = unicode:characters_to_binary([ + "foo", [<<"hé">>, ["bar"], "foobar"] + ]), + ok.
diff --git a/tests/libs/estdlib/test_io_lib.erl b/tests/libs/estdlib/test_io_lib.erl index d2521327f..30afdc00a 100644 --- a/tests/libs/estdlib/test_io_lib.erl +++ b/tests/libs/estdlib/test_io_lib.erl @@ -146,8 +146,8 @@ test() -> % ?ASSERT_MATCH(?FLT(io_lib:format("~-3p", ["foobar"])), "foo"), ?ASSERT_MATCH(?FLT(io_lib:format("~3s", ["foobar"])), "foo"), ?ASSERT_MATCH(?FLT(io_lib:format("~s", [<<"hé"/utf8>>])), [104, 195, 169]), - % ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8>>])), [104,233]), - % ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8, 223>>])), [104,195,169,223]), + ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8>>])), [104, 233]), + ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8, 223>>])), [104, 195, 169, 223]), ?ASSERT_MATCH(?FLT(io_lib:format("~-3s", ["foobar"])), "foo"), ?ASSERT_MATCH(?FLT(io_lib:format("~3s", ["foo"])), "foo"), ?ASSERT_MATCH(?FLT(io_lib:format("~-3s", ["foo"])), "foo"), diff --git a/tests/test.c b/tests/test.c index caf1fbd04..c170a6a5a 100644 --- a/tests/test.c +++ b/tests/test.c @@ -281,6 +281,7 @@ struct Test tests[] = { TEST_CASE(test_list_to_binary), TEST_CASE_EXPECTED(test_binary_to_list, 0), TEST_CASE_EXPECTED(test_atom_to_binary, 1), + TEST_CASE(test_unicode), TEST_CASE_EXPECTED(test_binary_part, 12), TEST_CASE_EXPECTED(test_binary_split, 16),