Skip to content

Commit

Permalink
Merge pull request #735 from pguyot/w31/add-missing-unicode-module
Browse files Browse the repository at this point in the history
Implement `unicode:characters_to_list/1,2` and
`unicode:characters_to_binary/1,2,3` using new `interop_iolist_fold`.

This fixes `io_lib:format/2` with `t` modifier.

Rename `interop_iolist_fold` to `interop_chardata_fold` because it really
processes `iodata` and now `chardata` as it works for unicode.

These changes are made under both the "Apache 2.0" and the "GNU Lesser General
Public License 2.1 or later" license terms (dual license).

SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
  • Loading branch information
bettio committed Aug 11, 2023
2 parents ce0be6c + ca8d8de commit eb2a5a1
Show file tree
Hide file tree
Showing 17 changed files with 789 additions and 35 deletions.
1 change: 0 additions & 1 deletion .clang-format-ignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# We have a number of existing files that are quite "re-format unfriendly"
# Let's ignore all of them
src/libAtomVM/bif.c
src/libAtomVM/bitstring.c
src/libAtomVM/bitstring.h
src/libAtomVM/debug.h
src/libAtomVM/defaultatoms.c
Expand Down
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added support for Erlang `gpio:close/1` and Elixir `GPIO.close/1` for ESP32
- Added support for the Erlang `gen_event` module
- Added `start_link` support for the `network` module
- Added support for `erlang:monotomic_time/1`
- Added support for `erlang:monotonic_time/1`
- Added `start_link` support for the `gen_statem` module
- Added support for serializing floats in erlang external term encoding
- Added support for the `SMALL_BIG_EXT` erlang external term encoding
Expand All @@ -56,6 +56,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `esp:partition_list/0` function
- Added `esp:nvs_fetch_binary/2` and `nvs_put_binary/3` functions (`esp:nvs_set_binary` and
functions that default to `?ATOMVM_NVS_NS` are deprecated now).
- Added most format possibilities to `io:format/2` and `io_lib:format/2`
- Added `unicode` module with `characters_to_list/1,2` and `characters_to_binary/1,2,3` functions

### Fixed
- Fixed issue with formatting integers with io:format() on STM32 platform
Expand Down
1 change: 1 addition & 0 deletions libs/estdlib/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ set(ERLANG_MODULES
proplists
string
timer
unicode
erlang
)

Expand Down
129 changes: 129 additions & 0 deletions libs/estdlib/src/unicode.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
%
% This file is part of AtomVM.
%
% Copyright 2023 Paul Guyot <[email protected]>
%
% Licensed under the Apache License, Version 2.0 (the "License");
% you may not use this file except in compliance with the License.
% You may obtain a copy of the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS,
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
% See the License for the specific language governing permissions and
% limitations under the License.
%
% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
%

%%-----------------------------------------------------------------------------
%% @doc An implementation of the Erlang/OTP unicode interface.
%%
%% This module implements a strict subset of the Erlang/OTP unicode
%% interface.
%% @end
%%-----------------------------------------------------------------------------
-module(unicode).

-export([
characters_to_list/1,
characters_to_list/2,
characters_to_binary/1,
characters_to_binary/2,
characters_to_binary/3
]).

%% A UTF-8 encoded binary. The type itself is a plain `binary()'; the UTF-8
%% encoding is a convention that the type does not enforce.
-type unicode_binary() :: binary().

%% Latin-1 encoded data, represented as `iodata()'.
-type latin1_chardata() :: iodata().

%% Unicode or UTF-8 encoded data: either a character list (see `charlist()')
%% or a UTF-8 encoded binary.
-type chardata() :: charlist() | unicode_binary().
%% A possibly improper list whose elements are characters, UTF-8 encoded
%% binaries or nested character lists, and whose tail is either a UTF-8
%% encoded binary or the empty list.
-type charlist() :: maybe_improper_list(
char() | unicode_binary() | charlist(), unicode_binary() | []
).

%% Encodings that can be named in the `Encoding', `InEncoding' and
%% `OutEncoding' parameters of the functions of this module.
-type encoding() :: utf8 | latin1.

-export_type([
unicode_binary/0,
latin1_chardata/0,
chardata/0,
charlist/0,
encoding/0
]).

%% @doc Convert UTF-8 data to a list of Unicode characters.
%% <p>If conversion fails, the function returns a tuple with three elements:</p>
%% <ul>
%% <li>First element is <code>error</code> or <code>incomplete</code>.
%% <code>incomplete</code> means the conversion failed because of an
%% incomplete unicode transform at the very end of data.</li>
%% <li>Second element is the list of characters converted so far.</li>
%% <li>Third element is the remaining data to be converted, for debugging
%% purposes. This remaining data can differ from what Erlang/OTP returns.</li>
%% </ul>
%% @see characters_to_list/2
%% @param Data data to convert to Unicode
%% @return a list of characters or a tuple if conversion failed.
-spec characters_to_list(Data :: chardata() | latin1_chardata()) ->
list() | {error, list(), chardata() | latin1_chardata()} | {incomplete, list(), binary()}.
characters_to_list(_Data) ->
%% Placeholder body: raises `undefined' if it is ever executed. The actual
%% conversion is expected to be provided natively by the VM (NIF).
erlang:nif_error(undefined).

%% @doc Convert UTF-8 or Latin1 data to a list of Unicode characters.
%% <p>On failure a three-element tuple tagged <code>error</code> or
%% <code>incomplete</code> is returned, as described for
%% <code>characters_to_list/1</code>.</p>
%% @see characters_to_list/1
%% @param Data data to convert
%% @param Encoding encoding of data to convert (<code>utf8</code> or <code>latin1</code>)
%% @return a list of characters or a tuple if conversion failed.
-spec characters_to_list(Data :: chardata() | latin1_chardata(), Encoding :: encoding()) ->
list()
| {error, list(), chardata() | latin1_chardata()}
| {incomplete, list(), chardata() | latin1_chardata()}.
characters_to_list(_Data, _Encoding) ->
%% Placeholder body: raises `undefined' if it is ever executed. The actual
%% conversion is expected to be provided natively by the VM (NIF).
erlang:nif_error(undefined).

%% @doc Convert character data to a UTF-8 binary.
%% <p>If conversion fails, a three-element tuple is returned: the tag
%% (<code>error</code> or <code>incomplete</code>), the binary converted so
%% far and the remaining data.</p>
%% @equiv characters_to_binary(Data, utf8, utf8)
%% @param Data data to convert to UTF-8
%% @return a UTF-8 binary or a tuple if conversion failed.
-spec characters_to_binary(Data :: chardata() | latin1_chardata()) ->
    unicode_binary()
    | {error, unicode_binary(), chardata() | latin1_chardata()}
    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
characters_to_binary(_Data) ->
    %% Placeholder body: raises `undefined' if it is ever executed. The actual
    %% conversion is expected to be provided natively by the VM (NIF).
    erlang:nif_error(undefined).

%% @doc Convert character data in a given encoding to a UTF-8 binary.
%% <p>If conversion fails, a three-element tuple is returned: the tag
%% (<code>error</code> or <code>incomplete</code>), the binary converted so
%% far and the remaining data.</p>
%% @equiv characters_to_binary(Data, InEncoding, utf8)
%% @param Data data to convert to UTF-8
%% @param InEncoding encoding of data
%% @return a UTF-8 binary or a tuple if conversion failed.
-spec characters_to_binary(Data :: chardata() | latin1_chardata(), InEncoding :: encoding()) ->
    unicode_binary()
    | {error, unicode_binary(), chardata() | latin1_chardata()}
    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
characters_to_binary(_Data, _InEncoding) ->
    %% Placeholder body: raises `undefined' if it is ever executed. The actual
    %% conversion is expected to be provided natively by the VM (NIF).
    erlang:nif_error(undefined).

%% @doc Convert character data in a given encoding to a binary in a given encoding.
%% <p>If conversion fails, the function returns a tuple with three elements:</p>
%% <ul>
%% <li>First element is <code>error</code> or <code>incomplete</code>.
%% <code>incomplete</code> means the conversion failed because of an
%% incomplete unicode transform at the very end of data.</li>
%% <li>Second element is the binary that has been converted so far.</li>
%% <li>Third element is the remaining data to be converted, for debugging
%% purposes. This remaining data can differ from what Erlang/OTP returns.</li>
%% </ul>
%% <p>Also, Erlang/OTP's implementation may error with <code>badarg</code> for parameters
%% for which this function merely returns an error tuple.</p>
%% @param Data data to convert
%% @param InEncoding encoding of input data
%% @param OutEncoding output encoding
%% @return an encoded binary or a tuple if conversion failed.
-spec characters_to_binary(
    Data :: chardata() | latin1_chardata(), InEncoding :: encoding(), OutEncoding :: encoding()
) ->
    unicode_binary()
    | {error, unicode_binary(), chardata() | latin1_chardata()}
    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
characters_to_binary(_Data, _InEncoding, _OutEncoding) ->
    %% Placeholder body: raises `undefined' if it is ever executed. The actual
    %% conversion is expected to be provided natively by the VM (NIF).
    erlang:nif_error(undefined).
42 changes: 26 additions & 16 deletions src/libAtomVM/bitstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@

static inline uint64_t from_le64(uint64_t value)
{
return ((((value) & 0xFF) << 56) | (((value) & 0xFF00) << 40) | (((value) & 0xFF0000) << 24) | \
(((value) & 0xFF000000) << 8) | (((value) & 0xFF00000000) >> 8) | (((value) & 0xFF0000000000) >> 24) | \
(((value) & 0xFF000000000000) >> 40) | (((value) & 0xFF00000000000000) >> 56));
return ((((value) &0xFF) << 56) | (((value) &0xFF00) << 40) | (((value) &0xFF0000) << 24) | (((value) &0xFF000000) << 8) | (((value) &0xFF00000000) >> 8) | (((value) &0xFF0000000000) >> 24) | (((value) &0xFF000000000000) >> 40) | (((value) &0xFF00000000000000) >> 56));
}

bool bitstring_extract_any_integer(const uint8_t *src, size_t offset, avm_int_t n,
Expand Down Expand Up @@ -140,12 +138,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
return true;
}

bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size)
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
{
if (len == 0) {
return false;
return UnicodeTransformDecodeFail;
} else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
int32_t v = 0;
uint32_t v = 0;
v |= (buf[0] & 0x07) << 18;
v |= (buf[1] & 0x3F) << 12;
v |= (buf[2] & 0x3F) << 6;
Expand All @@ -156,9 +154,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
}
*c = v;
*out_size = 4;
return true;
return UnicodeTransformDecodeSuccess;
} else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
int32_t v = 0;
uint32_t v = 0;
v |= (buf[0] & 0x0F) << 12;
v |= (buf[1] & 0x3F) << 6;
v |= (buf[2] & 0x3F);
Expand All @@ -168,9 +166,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
}
*c = v;
*out_size = 3;
return true;
return UnicodeTransformDecodeSuccess;
} else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
int32_t v = 0;
uint32_t v = 0;
v |= (buf[0] & 0x1F) << 6;
v |= (buf[1] & 0x3F);
// overlong encoding
Expand All @@ -179,16 +177,28 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
}
*c = v;
*out_size = 2;
return true;
return UnicodeTransformDecodeSuccess;
} else if ((*buf & 0x80) == 0) {
int32_t v = 0;
uint32_t v = 0;
v |= (buf[0] & 0x7F);
*c = v;
*out_size = 1;
return true;
return UnicodeTransformDecodeSuccess;
} else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
return UnicodeTransformDecodeIncomplete;
} else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
return UnicodeTransformDecodeIncomplete;
} else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
return UnicodeTransformDecodeIncomplete;
} else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
return UnicodeTransformDecodeIncomplete;
} else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
return UnicodeTransformDecodeIncomplete;
} else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
return UnicodeTransformDecodeIncomplete;
}

return false;
return UnicodeTransformDecodeFail;
}

// UTF-16 encoding, when U in U+010000 to U+10FFFF:
Expand Down Expand Up @@ -321,7 +331,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
v |= (buf[3] & 0xFF) << 24;
v |= (buf[2] & 0xFF) << 16;
v |= (buf[1] & 0xFF) << 8;
v |= buf[0] & 0xFF;
v |= buf[0] & 0xFF;
if (is_invalid_codepoint(v)) {
return false;
}
Expand All @@ -332,7 +342,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
v |= (buf[0] & 0xFF) << 24;
v |= (buf[1] & 0xFF) << 16;
v |= (buf[2] & 0xFF) << 8;
v |= buf[3] & 0xFF;
v |= buf[3] & 0xFF;
if (is_invalid_codepoint(v)) {
return false;
}
Expand Down
19 changes: 14 additions & 5 deletions src/libAtomVM/bitstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ enum BitstringFlags
#endif
};

/**
 * @brief Result of decoding one character from a Unicode transformation
 * format buffer, as returned by bitstring_utf8_decode.
 */
enum UnicodeTransformDecodeResult
{
/** Decoding was successful. */
UnicodeTransformDecodeSuccess,
/** The bytes at the start of the buffer are not a valid unicode character. */
UnicodeTransformDecodeFail,
/** The bytes at the start of the buffer are a valid but incomplete transformation. */
UnicodeTransformDecodeIncomplete
};

union maybe_unsigned_int8
{
uint8_t u;
Expand Down Expand Up @@ -320,10 +327,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
* @param len the length (in bytes) of the bytes in buf
* @param c int value to decode to or NULL to only compute the size.
* @param out_size the size in bytes, on output (if not NULL)
* @return \c true if decoding was successful, \c false if character starting at buf is not a valid
* unicode character
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
* unicode character or \c UnicodeTransformDecodeIncomplete if character
* starting at buf is a valid but incomplete transformation
*/
bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size);
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);

/**
* @brief Encode a character to UTF-16.
Expand Down Expand Up @@ -428,11 +437,11 @@ static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t
* @return \c true if decoding was successful, \c false if src_bin at offset is not a valid
* unicode character
*/
static inline bool bitstring_match_utf8(term src_bin, size_t offset, int32_t *c, size_t *out_size)
static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c, size_t *out_size)
{
size_t byte_offset = offset >> 3; // divide by 8
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size);
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
}

/**
Expand Down
4 changes: 4 additions & 0 deletions src/libAtomVM/defaultatoms.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ static const char *const attributes_atom = "\xA" "attributes";
static const char *const compile_atom = "\x7" "compile";
static const char *const exports_atom = "\x7" "exports";

static const char *const incomplete_atom = "\xA" "incomplete";

void defaultatoms_init(GlobalContext *glb)
{
int ok = 1;
Expand Down Expand Up @@ -270,6 +272,8 @@ void defaultatoms_init(GlobalContext *glb)
ok &= globalcontext_insert_atom(glb, compile_atom) == COMPILE_ATOM_INDEX;
ok &= globalcontext_insert_atom(glb, exports_atom) == EXPORTS_ATOM_INDEX;

ok &= globalcontext_insert_atom(glb, incomplete_atom) == INCOMPLETE_ATOM_INDEX;

if (!ok) {
AVM_ABORT();
}
Expand Down
6 changes: 5 additions & 1 deletion src/libAtomVM/defaultatoms.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,9 @@ extern "C" {
#define COMPILE_ATOM_INDEX 97
#define EXPORTS_ATOM_INDEX 98

#define PLATFORM_ATOMS_BASE_INDEX 99
#define INCOMPLETE_ATOM_INDEX 99

#define PLATFORM_ATOMS_BASE_INDEX 100

#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
Expand Down Expand Up @@ -279,6 +281,8 @@ extern "C" {
#define COMPILE_ATOM TERM_FROM_ATOM_INDEX(COMPILE_ATOM_INDEX)
#define EXPORTS_ATOM TERM_FROM_ATOM_INDEX(EXPORTS_ATOM_INDEX)

#define INCOMPLETE_ATOM TERM_FROM_ATOM_INDEX(INCOMPLETE_ATOM_INDEX)

void defaultatoms_init(GlobalContext *glb);

void platform_defaultatoms_init(GlobalContext *glb);
Expand Down
Loading

0 comments on commit eb2a5a1

Please sign in to comment.