Merge pull request #735 from pguyot/w31/add-missing-unicode-module

Implement `unicode:characters_to_list/1,2` and `unicode:characters_to_binary/1,2,3` using new `interop_iolist_fold`. This fixes `io_lib:format/2` with `t` modifier. Rename `interop_iolist_fold` to `interop_chardata_fold` because it really processes `iodata` and now `chardata` as it works for unicode. These changes are made under both the "Apache 2.0" and the "GNU Lesser General Public License 2.1 or later" license terms (dual license). SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
atomvm · Aug 11, 2023 · eb2a5a1 · eb2a5a1
2 parents ce0be6c + ca8d8de
commit eb2a5a1
Show file tree

Hide file tree

Showing 17 changed files with 789 additions and 35 deletions.
diff --git a/.clang-format-ignore b/.clang-format-ignore
@@ -7,7 +7,6 @@
 # We have a number of existing files that are quite "re-format unfriendly"
 # Let's ignore all of them
 src/libAtomVM/bif.c
-src/libAtomVM/bitstring.c
 src/libAtomVM/bitstring.h
 src/libAtomVM/debug.h
 src/libAtomVM/defaultatoms.c

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added support for Erlang `gpio:close/1` and Elixir `GPIO.close/1` for ESP32
 - Added support for the Erlang `gen_event` module
 - Added `start_link` support for the `network` module
-- Added support for `erlang:monotomic_time/1`
+- Added support for `erlang:monotonic_time/1`
 - Added `start_link` support for the `gen_statem` module
 - Added support for serializing floats in erlang external term encoding
 - Added support for the `SMALL_BIG_EXT` erlang external term encoding
@@ -56,6 +56,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `esp:partition_list/0` function
 - Added `esp:nvs_fetch_binary/2` and `nvs_put_binary/3` functions (`esp:nvs_set_binary` and
 functions that default to `?ATOMVM_NVS_NS` are deprecated now).
+- Added most format possibilities to `io:format/2` and `io_lib:format/2`
+- Added `unicode` module with `characters_to_list/1,2` and `characters_to_binary/1,2,3` functions
 
 ### Fixed
 - Fixed issue with formatting integers with io:format() on STM32 platform

diff --git a/libs/estdlib/src/CMakeLists.txt b/libs/estdlib/src/CMakeLists.txt
@@ -42,6 +42,7 @@ set(ERLANG_MODULES
     proplists
     string
     timer
+    unicode
     erlang
 )
 

diff --git a/libs/estdlib/src/unicode.erl b/libs/estdlib/src/unicode.erl
@@ -0,0 +1,129 @@
+%
+% This file is part of AtomVM.
+%
+% Copyright 2023 Paul Guyot <[email protected]>
+%
+% Licensed under the Apache License, Version 2.0 (the "License");
+% you may not use this file except in compliance with the License.
+% You may obtain a copy of the License at
+%
+%    http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+% See the License for the specific language governing permissions and
+% limitations under the License.
+%
+% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
+%
+
+%%-----------------------------------------------------------------------------
+%% @doc An implementation of the Erlang/OTP unicode interface.
+%%
+%% This module implements a strict subset of the Erlang/OTP unicode
+%% interface.
+%% @end
+%%-----------------------------------------------------------------------------
+-module(unicode).
+
+-export([
+    characters_to_list/1,
+    characters_to_list/2,
+    characters_to_binary/1,
+    characters_to_binary/2,
+    characters_to_binary/3
+]).
+
+%% A UTF-8 encoded binary.
+-type unicode_binary() :: binary().
+
+%% Latin-1 encoded data
+-type latin1_chardata() :: iodata().
+
+%% Unicode or UTF-8 encoded data
+-type chardata() :: charlist() | unicode_binary().
+-type charlist() :: maybe_improper_list(
+    char() | unicode_binary() | charlist(), unicode_binary() | []
+).
+
+-type encoding() :: utf8 | latin1.
+
+-export_type([
+    unicode_binary/0,
+    latin1_chardata/0,
+    chardata/0,
+    charlist/0,
+    encoding/0
+]).
+
+%% @doc Convert UTF-8 data to a list of Unicode characters.
+%% <p>If conversion fails, the function returns a tuple with three elements:</p>
+%% <ul>
+%%     <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
+%%     <li>Second element is what has been converted so far.</li>
+%%     <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ from what Erlang/OTP returns.</li>
+%% </ul>
+%% @param Data data to convert to Unicode
+%% @return a list of characters or a tuple if conversion failed.
+-spec characters_to_list(Data :: chardata() | latin1_chardata()) ->
+    list() | {error, list(), chardata() | latin1_chardata()} | {incomplete, list(), binary()}.
+characters_to_list(_Data) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert UTF-8 or Latin1 data to a list of Unicode characters.
+%% @see characters_to_list/1
+%% @param Data data to convert
+%% @param Encoding encoding of data to convert
+%% @return a list of characters or a tuple if conversion failed.
+-spec characters_to_list(Data :: chardata() | latin1_chardata(), Encoding :: encoding()) ->
+    list()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, list(), chardata() | latin1_chardata()}.
+characters_to_list(_Data, _Encoding) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert character data to a UTF-8 binary
+%% @equiv characters_to_binary(Data, utf8, utf8)
+%% @param Data data to convert to UTF-8
+%% @return a UTF-8 binary or a tuple if conversion failed.
+-spec characters_to_binary(Data :: chardata() | latin1_chardata()) ->
+    unicode_binary()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
+characters_to_binary(_Data) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert character data in a given encoding to a UTF-8 binary
+%% @equiv characters_to_binary(Data, InEncoding, utf8)
+%% @param Data data to convert to UTF-8
+%% @param InEncoding encoding of data
+%% @return a UTF-8 binary or a tuple if conversion failed.
+-spec characters_to_binary(Data :: chardata() | latin1_chardata(), InEncoding :: encoding()) ->
+    unicode_binary()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
+characters_to_binary(_Data, _InEncoding) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert character data in a given encoding to a binary in a given encoding.
+%% <p>If conversion fails, the function returns a tuple with three elements:</p>
+%% <ul>
+%%     <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
+%%     <li>Second element is what has been converted so far.</li>
+%%     <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ from what Erlang/OTP returns.</li>
+%% </ul>
+%% <p>Also, Erlang/OTP's implementation may error with <code>badarg</code> for parameters
+%% for which this function merely returns an error tuple.</p>
+%% @param Data data to convert
+%% @param InEncoding encoding of input data
+%% @param OutEncoding output encoding
+%% @return an encoded binary or a tuple if conversion failed.
+-spec characters_to_binary(
+    Data :: chardata() | latin1_chardata(), InEncoding :: encoding(), OutEncoding :: encoding()
+) ->
+    unicode_binary()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
+characters_to_binary(_Data, _InEncoding, _OutEncoding) ->
+    erlang:nif_error(undefined).
diff --git a/src/libAtomVM/bitstring.c b/src/libAtomVM/bitstring.c
@@ -23,9 +23,7 @@
 
 static inline uint64_t from_le64(uint64_t value)
 {
-    return ((((value) & 0xFF) << 56) | (((value) & 0xFF00) << 40) | (((value) & 0xFF0000) << 24) | \
-        (((value) & 0xFF000000) << 8) | (((value) & 0xFF00000000) >> 8) | (((value) & 0xFF0000000000) >> 24) |  \
-         (((value) & 0xFF000000000000) >> 40) | (((value) & 0xFF00000000000000) >> 56));
+    return ((((value) &0xFF) << 56) | (((value) &0xFF00) << 40) | (((value) &0xFF0000) << 24) | (((value) &0xFF000000) << 8) | (((value) &0xFF00000000) >> 8) | (((value) &0xFF0000000000) >> 24) | (((value) &0xFF000000000000) >> 40) | (((value) &0xFF00000000000000) >> 56));
 }
 
 bool bitstring_extract_any_integer(const uint8_t *src, size_t offset, avm_int_t n,
@@ -140,12 +138,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
     return true;
 }
 
-bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size)
+enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
 {
     if (len == 0) {
-        return false;
+        return UnicodeTransformDecodeFail;
     } else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x07) << 18;
         v |= (buf[1] & 0x3F) << 12;
         v |= (buf[2] & 0x3F) << 6;
@@ -156,9 +154,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
         }
         *c = v;
         *out_size = 4;
-        return true;
+        return UnicodeTransformDecodeSuccess;
     } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x0F) << 12;
         v |= (buf[1] & 0x3F) << 6;
         v |= (buf[2] & 0x3F);
@@ -168,9 +166,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
         }
         *c = v;
         *out_size = 3;
-        return true;
+        return UnicodeTransformDecodeSuccess;
     } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x1F) << 6;
         v |= (buf[1] & 0x3F);
         // overlong encoding
@@ -179,16 +177,28 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
         }
         *c = v;
         *out_size = 2;
-        return true;
+        return UnicodeTransformDecodeSuccess;
     } else if ((*buf & 0x80) == 0) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x7F);
         *c = v;
         *out_size = 1;
-        return true;
+        return UnicodeTransformDecodeSuccess;
+    } else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
+        return UnicodeTransformDecodeIncomplete;
     }
 
-    return false;
+    return UnicodeTransformDecodeFail;
 }
 
 // UTF-16 encoding, when U in U+010000 to U+10FFFF:
@@ -321,7 +331,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
         v |= (buf[3] & 0xFF) << 24;
         v |= (buf[2] & 0xFF) << 16;
         v |= (buf[1] & 0xFF) << 8;
-        v |=  buf[0] & 0xFF;
+        v |= buf[0] & 0xFF;
         if (is_invalid_codepoint(v)) {
             return false;
         }
@@ -332,7 +342,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
         v |= (buf[0] & 0xFF) << 24;
         v |= (buf[1] & 0xFF) << 16;
         v |= (buf[2] & 0xFF) << 8;
-        v |=  buf[3] & 0xFF;
+        v |= buf[3] & 0xFF;
         if (is_invalid_codepoint(v)) {
             return false;
         }

diff --git a/src/libAtomVM/bitstring.h b/src/libAtomVM/bitstring.h
@@ -99,6 +99,13 @@ enum BitstringFlags
 #endif
 };
 
+enum UnicodeTransformDecodeResult
+{
+    UnicodeTransformDecodeSuccess,
+    UnicodeTransformDecodeFail,
+    UnicodeTransformDecodeIncomplete
+};
+
 union maybe_unsigned_int8
 {
     uint8_t u;
@@ -320,10 +327,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
  * @param len the length (in bytes) of the bytes in buf
  * @param c int value to decode to or NULL to only compute the size.
  * @param out_size the size in bytes, on output (if not NULL)
- * @return \c true if decoding was successful, \c false if character starting at buf is not a valid
- * unicode character
+ * @return \c UnicodeTransformDecodeSuccess if decoding was successful,
+ * \c UnicodeTransformDecodeFail if character starting at buf is not a valid
+ * unicode character or \c UnicodeTransformDecodeIncomplete if character
+ * starting at buf is a valid but incomplete transformation
  */
-bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size);
+enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
 
 /**
  * @brief Encode a character to UTF-16.
@@ -428,11 +437,11 @@ static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t
  * @return \c true if encoding was successful, \c false if src_bin at offset is not a valid
  * unicode character
  */
-static inline bool bitstring_match_utf8(term src_bin, size_t offset, int32_t *c, size_t *out_size)
+static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c, size_t *out_size)
 {
     size_t byte_offset = offset >> 3; // divide by 8
     const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
-    return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size);
+    return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
 }
 
 /**

diff --git a/src/libAtomVM/defaultatoms.c b/src/libAtomVM/defaultatoms.c
@@ -143,6 +143,8 @@ static const char *const attributes_atom = "\xA" "attributes";
 static const char *const compile_atom = "\x7" "compile";
 static const char *const exports_atom = "\x7" "exports";
 
+static const char *const incomplete_atom = "\xA" "incomplete";
+
 void defaultatoms_init(GlobalContext *glb)
 {
     int ok = 1;
@@ -270,6 +272,8 @@ void defaultatoms_init(GlobalContext *glb)
     ok &= globalcontext_insert_atom(glb, compile_atom) == COMPILE_ATOM_INDEX;
     ok &= globalcontext_insert_atom(glb, exports_atom) == EXPORTS_ATOM_INDEX;
 
+    ok &= globalcontext_insert_atom(glb, incomplete_atom) == INCOMPLETE_ATOM_INDEX;
+
     if (!ok) {
         AVM_ABORT();
     }

diff --git a/src/libAtomVM/defaultatoms.h b/src/libAtomVM/defaultatoms.h
@@ -152,7 +152,9 @@ extern "C" {
 #define COMPILE_ATOM_INDEX 97
 #define EXPORTS_ATOM_INDEX 98
 
-#define PLATFORM_ATOMS_BASE_INDEX 99
+#define INCOMPLETE_ATOM_INDEX 99
+
+#define PLATFORM_ATOMS_BASE_INDEX 100
 
 #define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
 #define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
@@ -279,6 +281,8 @@ extern "C" {
 #define COMPILE_ATOM TERM_FROM_ATOM_INDEX(COMPILE_ATOM_INDEX)
 #define EXPORTS_ATOM TERM_FROM_ATOM_INDEX(EXPORTS_ATOM_INDEX)
 
+#define INCOMPLETE_ATOM TERM_FROM_ATOM_INDEX(INCOMPLETE_ATOM_INDEX)
+
 void defaultatoms_init(GlobalContext *glb);
 
 void platform_defaultatoms_init(GlobalContext *glb);