From ca8d8deb96d4fa44646f8b44bc99650b5946a48a Mon Sep 17 00:00:00 2001
From: Paul Guyot <pguyot@kallisys.net>
Date: Fri, 11 Aug 2023 07:41:32 +0200
Subject: [PATCH] Add missing unicode module

Implement `unicode:characters_to_list/1,2` and `unicode:characters_to_binary/1,2,3` using new `interop_iolist_fold`.

This fixes `io_lib:format/2` with `t` modifier.

Rename `interop_iolist_fold` to `interop_chardata_fold` because it really processes `iodata`, and now also `chardata` since it works for unicode.

Signed-off-by: Paul Guyot <pguyot@kallisys.net>
---
 .clang-format-ignore                |   1 -
 CHANGELOG.md                        |   4 +-
 libs/estdlib/src/CMakeLists.txt     |   1 +
 libs/estdlib/src/unicode.erl        | 129 ++++++++++++
 src/libAtomVM/bitstring.c           |  42 ++--
 src/libAtomVM/bitstring.h           |  19 +-
 src/libAtomVM/defaultatoms.c        |   4 +
 src/libAtomVM/defaultatoms.h        |   6 +-
 src/libAtomVM/interop.c             | 307 +++++++++++++++++++++++++++-
 src/libAtomVM/interop.h             |  24 ++-
 src/libAtomVM/nifs.c                | 124 +++++++++++
 src/libAtomVM/nifs.gperf            |   5 +
 src/libAtomVM/opcodesswitch.h       |   4 +-
 tests/erlang_tests/CMakeLists.txt   |   2 +
 tests/erlang_tests/test_unicode.erl | 147 +++++++++++++
 tests/libs/estdlib/test_io_lib.erl  |   4 +-
 tests/test.c                        |   1 +
 17 files changed, 789 insertions(+), 35 deletions(-)
 create mode 100644 libs/estdlib/src/unicode.erl
 create mode 100644 tests/erlang_tests/test_unicode.erl

diff --git a/.clang-format-ignore b/.clang-format-ignore
index 5c07b9fbd..61dbe32ae 100644
--- a/.clang-format-ignore
+++ b/.clang-format-ignore
@@ -7,7 +7,6 @@
 # We have a number of existing files that are quite "re-format unfriendly"
 # Let's ignore all of them
 src/libAtomVM/bif.c
-src/libAtomVM/bitstring.c
 src/libAtomVM/bitstring.h
 src/libAtomVM/debug.h
 src/libAtomVM/defaultatoms.c
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3151349c8..56a6d445f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added support for Erlang `gpio:close/1` and Elixir `GPIO.close/1` for ESP32
 - Added support for the Erlang `gen_event` module
 - Added `start_link` support for the `network` module
-- Added support for `erlang:monotomic_time/1`
+- Added support for `erlang:monotonic_time/1`
 - Added `start_link` support for the `gen_statem` module
 - Added support for serializing floats in erlang external term encoding
 - Added support for the `SMALL_BIG_EXT` erlang external term encoding
@@ -56,6 +56,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `esp:partition_list/0` function
 - Added `esp:nvs_fetch_binary/2` and `nvs_put_binary/3` functions (`esp:nvs_set_binary` and
 functions that default to `?ATOMVM_NVS_NS` are deprecated now).
+- Added most format possibilities to `io:format/2` and `io_lib:format/2`
+- Added `unicode` module with `characters_to_list/1,2` and `characters_to_binary/1,2,3` functions
 
 ### Fixed
 - Fixed issue with formatting integers with io:format() on STM32 platform
diff --git a/libs/estdlib/src/CMakeLists.txt b/libs/estdlib/src/CMakeLists.txt
index 57f969eb4..211f2e3cc 100644
--- a/libs/estdlib/src/CMakeLists.txt
+++ b/libs/estdlib/src/CMakeLists.txt
@@ -42,6 +42,7 @@ set(ERLANG_MODULES
     proplists
     string
     timer
+    unicode
     erlang
 )
 
diff --git a/libs/estdlib/src/unicode.erl b/libs/estdlib/src/unicode.erl
new file mode 100644
index 000000000..4ca2d939e
--- /dev/null
+++ b/libs/estdlib/src/unicode.erl
@@ -0,0 +1,129 @@
+%
+% This file is part of AtomVM.
+%
+% Copyright 2023 Paul Guyot <pguyot@kallisys.net>
+%
+% Licensed under the Apache License, Version 2.0 (the "License");
+% you may not use this file except in compliance with the License.
+% You may obtain a copy of the License at
+%
+%    http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+% See the License for the specific language governing permissions and
+% limitations under the License.
+%
+% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
+%
+
+%%-----------------------------------------------------------------------------
+%% @doc An implementation of the Erlang/OTP unicode interface.
+%%
+%% This module implements a strict subset of the Erlang/OTP unicode
+%% interface.
+%% @end
+%%-----------------------------------------------------------------------------
+-module(unicode).
+
+-export([
+    characters_to_list/1,
+    characters_to_list/2,
+    characters_to_binary/1,
+    characters_to_binary/2,
+    characters_to_binary/3
+]).
+
+%% A UTF-8 encoded binary.
+-type unicode_binary() :: binary().
+
+%% Latin-1 encoded data
+-type latin1_chardata() :: iodata().
+
+%% Unicode or UTF-8 encoded data
+-type chardata() :: charlist() | unicode_binary().
+-type charlist() :: maybe_improper_list(
+    char() | unicode_binary() | charlist(), unicode_binary() | []
+).
+
+-type encoding() :: utf8 | latin1.
+
+-export_type([
+    unicode_binary/0,
+    latin1_chardata/0,
+    chardata/0,
+    charlist/0,
+    encoding/0
+]).
+
+%% @doc Convert UTF-8 data to a list of Unicode characters.
+%% <p>If conversion fails, the function returns a tuple with three elements:</p>
+%% <ul>
+%%     <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
+%%     <li>Second element is what has been converted so far.</li>
+%%     <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ from what Erlang/OTP returns.</li>
+%% </ul>
+%% @param Data data to convert to Unicode
+%% @return a list of characters or a tuple if conversion failed.
+-spec characters_to_list(Data :: chardata() | latin1_chardata()) ->
+    list() | {error, list(), chardata() | latin1_chardata()} | {incomplete, list(), binary()}.
+characters_to_list(_Data) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert UTF-8 or Latin1 data to a list of Unicode characters.
+%% @see characters_to_list/1
+%% @param Data data to convert
+%% @param Encoding encoding of data to convert
+%% @return a list of characters or a tuple if conversion failed.
+-spec characters_to_list(Data :: chardata() | latin1_chardata(), Encoding :: encoding()) ->
+    list()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, list(), chardata() | latin1_chardata()}.
+characters_to_list(_Data, _Encoding) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert character data to a UTF-8 binary
+%% @equiv characters_to_binary(Data, utf8, utf8)
+%% @param Data data to convert to UTF-8
+%% @return a UTF-8 binary or a tuple if conversion failed.
+-spec characters_to_binary(Data :: chardata() | latin1_chardata()) ->
+    unicode_binary()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
+characters_to_binary(_Data) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert character data in a given encoding to a UTF-8 binary
+%% @equiv characters_to_binary(Data, InEncoding, utf8)
+%% @param Data data to convert to UTF-8
+%% @param InEncoding encoding of data
+%% @return a UTF-8 binary or a tuple if conversion failed.
+-spec characters_to_binary(Data :: chardata() | latin1_chardata(), InEncoding :: encoding()) ->
+    unicode_binary()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
+characters_to_binary(_Data, _InEncoding) ->
+    erlang:nif_error(undefined).
+
+%% @doc Convert character data in a given encoding to a binary in a given encoding.
+%% <p>If conversion fails, the function returns a tuple with three elements:</p>
+%% <ul>
+%%     <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
+%%     <li>Second element is what has been converted so far.</li>
+%%     <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ from what Erlang/OTP returns.</li>
+%% </ul>
+%% <p>Also, Erlang/OTP's implementation may error with <code>badarg</code> for parameters
+%% for which this function merely returns an error tuple.</p>
+%% @param Data data to convert
+%% @param InEncoding encoding of input data
+%% @param OutEncoding encoding of the output binary
+%% @return an encoded binary or a tuple if conversion failed.
+-spec characters_to_binary(
+    Data :: chardata() | latin1_chardata(), InEncoding :: encoding(), OutEncoding :: encoding()
+) ->
+    unicode_binary()
+    | {error, list(), chardata() | latin1_chardata()}
+    | {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
+characters_to_binary(_Data, _InEncoding, _OutEncoding) ->
+    erlang:nif_error(undefined).
diff --git a/src/libAtomVM/bitstring.c b/src/libAtomVM/bitstring.c
index b5beb4f3d..307862848 100644
--- a/src/libAtomVM/bitstring.c
+++ b/src/libAtomVM/bitstring.c
@@ -23,9 +23,7 @@
 
 static inline uint64_t from_le64(uint64_t value)
 {
-    return ((((value) & 0xFF) << 56) | (((value) & 0xFF00) << 40) | (((value) & 0xFF0000) << 24) | \
-        (((value) & 0xFF000000) << 8) | (((value) & 0xFF00000000) >> 8) | (((value) & 0xFF0000000000) >> 24) |  \
-         (((value) & 0xFF000000000000) >> 40) | (((value) & 0xFF00000000000000) >> 56));
+    return ((((value) &0xFF) << 56) | (((value) &0xFF00) << 40) | (((value) &0xFF0000) << 24) | (((value) &0xFF000000) << 8) | (((value) &0xFF00000000) >> 8) | (((value) &0xFF0000000000) >> 24) | (((value) &0xFF000000000000) >> 40) | (((value) &0xFF00000000000000) >> 56));
 }
 
 bool bitstring_extract_any_integer(const uint8_t *src, size_t offset, avm_int_t n,
@@ -140,12 +138,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
     return true;
 }
 
-bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size)
+enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
 {
     if (len == 0) {
-        return false;
+        return UnicodeTransformDecodeFail;
     } else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x07) << 18;
         v |= (buf[1] & 0x3F) << 12;
         v |= (buf[2] & 0x3F) << 6;
@@ -156,9 +154,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
         }
         *c = v;
         *out_size = 4;
-        return true;
+        return UnicodeTransformDecodeSuccess;
     } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x0F) << 12;
         v |= (buf[1] & 0x3F) << 6;
         v |= (buf[2] & 0x3F);
@@ -168,9 +166,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
         }
         *c = v;
         *out_size = 3;
-        return true;
+        return UnicodeTransformDecodeSuccess;
     } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x1F) << 6;
         v |= (buf[1] & 0x3F);
         // overlong encoding
@@ -179,16 +177,28 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
         }
         *c = v;
         *out_size = 2;
-        return true;
+        return UnicodeTransformDecodeSuccess;
     } else if ((*buf & 0x80) == 0) {
-        int32_t v = 0;
+        uint32_t v = 0;
         v |= (buf[0] & 0x7F);
         *c = v;
         *out_size = 1;
-        return true;
+        return UnicodeTransformDecodeSuccess;
+    } else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
+        return UnicodeTransformDecodeIncomplete;
+    } else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
+        return UnicodeTransformDecodeIncomplete;
     }
 
-    return false;
+    return UnicodeTransformDecodeFail;
 }
 
 // UTF-16 encoding, when U in U+010000 to U+10FFFF:
@@ -321,7 +331,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
         v |= (buf[3] & 0xFF) << 24;
         v |= (buf[2] & 0xFF) << 16;
         v |= (buf[1] & 0xFF) << 8;
-        v |=  buf[0] & 0xFF;
+        v |= buf[0] & 0xFF;
         if (is_invalid_codepoint(v)) {
             return false;
         }
@@ -332,7 +342,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
         v |= (buf[0] & 0xFF) << 24;
         v |= (buf[1] & 0xFF) << 16;
         v |= (buf[2] & 0xFF) << 8;
-        v |=  buf[3] & 0xFF;
+        v |= buf[3] & 0xFF;
         if (is_invalid_codepoint(v)) {
             return false;
         }
diff --git a/src/libAtomVM/bitstring.h b/src/libAtomVM/bitstring.h
index 60bb7baeb..385c4e343 100644
--- a/src/libAtomVM/bitstring.h
+++ b/src/libAtomVM/bitstring.h
@@ -99,6 +99,13 @@ enum BitstringFlags
 #endif
 };
 
+enum UnicodeTransformDecodeResult
+{
+    UnicodeTransformDecodeSuccess,
+    UnicodeTransformDecodeFail,
+    UnicodeTransformDecodeIncomplete
+};
+
 union maybe_unsigned_int8
 {
     uint8_t u;
@@ -320,10 +327,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
  * @param len the length (in bytes) of the bytes in buf
  * @param c int value to decode to or NULL to only compute the size.
  * @param out_size the size in bytes, on output (if not NULL)
- * @return \c true if decoding was successful, \c false if character starting at buf is not a valid
- * unicode character
+ * @return \c UnicodeTransformDecodeSuccess if decoding was successful,
+ * \c UnicodeTransformDecodeFail if character starting at buf is not a valid
+ * unicode character or \c UnicodeTransformDecodeIncomplete if character
+ * starting at buf is a valid but incomplete transformation
  */
-bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size);
+enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
 
 /**
  * @brief Encode a character to UTF-16.
@@ -428,11 +437,11 @@ static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t
  * @return \c true if encoding was successful, \c false if src_bin at offset is not a valid
  * unicode character
  */
-static inline bool bitstring_match_utf8(term src_bin, size_t offset, int32_t *c, size_t *out_size)
+static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c, size_t *out_size)
 {
     size_t byte_offset = offset >> 3; // divide by 8
     const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
-    return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size);
+    return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
 }
 
 /**
diff --git a/src/libAtomVM/defaultatoms.c b/src/libAtomVM/defaultatoms.c
index d07df7a0a..5981644b8 100644
--- a/src/libAtomVM/defaultatoms.c
+++ b/src/libAtomVM/defaultatoms.c
@@ -143,6 +143,8 @@ static const char *const attributes_atom = "\xA" "attributes";
 static const char *const compile_atom = "\x7" "compile";
 static const char *const exports_atom = "\x7" "exports";
 
+static const char *const incomplete_atom = "\xA" "incomplete";
+
 void defaultatoms_init(GlobalContext *glb)
 {
     int ok = 1;
@@ -270,6 +272,8 @@ void defaultatoms_init(GlobalContext *glb)
     ok &= globalcontext_insert_atom(glb, compile_atom) == COMPILE_ATOM_INDEX;
     ok &= globalcontext_insert_atom(glb, exports_atom) == EXPORTS_ATOM_INDEX;
 
+    ok &= globalcontext_insert_atom(glb, incomplete_atom) == INCOMPLETE_ATOM_INDEX;
+
     if (!ok) {
         AVM_ABORT();
     }
diff --git a/src/libAtomVM/defaultatoms.h b/src/libAtomVM/defaultatoms.h
index a30f81624..1bf6a1198 100644
--- a/src/libAtomVM/defaultatoms.h
+++ b/src/libAtomVM/defaultatoms.h
@@ -152,7 +152,9 @@ extern "C" {
 #define COMPILE_ATOM_INDEX 97
 #define EXPORTS_ATOM_INDEX 98
 
-#define PLATFORM_ATOMS_BASE_INDEX 99
+#define INCOMPLETE_ATOM_INDEX 99
+
+#define PLATFORM_ATOMS_BASE_INDEX 100
 
 #define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
 #define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
@@ -279,6 +281,8 @@ extern "C" {
 #define COMPILE_ATOM TERM_FROM_ATOM_INDEX(COMPILE_ATOM_INDEX)
 #define EXPORTS_ATOM TERM_FROM_ATOM_INDEX(EXPORTS_ATOM_INDEX)
 
+#define INCOMPLETE_ATOM TERM_FROM_ATOM_INDEX(INCOMPLETE_ATOM_INDEX)
+
 void defaultatoms_init(GlobalContext *glb);
 
 void platform_defaultatoms_init(GlobalContext *glb);
diff --git a/src/libAtomVM/interop.c b/src/libAtomVM/interop.c
index b04b00281..f75261b44 100644
--- a/src/libAtomVM/interop.c
+++ b/src/libAtomVM/interop.c
@@ -20,9 +20,13 @@
 
 #include "interop.h"
 
+#include "bitstring.h"
 #include "defaultatoms.h"
 #include "tempstack.h"
+#include "term.h"
+#include "term_typedef.h"
 #include "valueshashtable.h"
+#include <stdint.h>
 
 char *interop_term_to_string(term t, int *ok)
 {
@@ -176,7 +180,7 @@ term interop_proplist_get_value_default(term list, term key, term default_value)
     return default_value;
 }
 
-inline InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun fold_fun, void *accum)
+inline InteropFunctionResult interop_chardata_fold(term t, interop_chardata_fold_fun fold_fun, interop_chardata_rest_fun rest_fun, void *accum)
 {
     if (term_is_binary(t)) {
         return fold_fun(t, accum);
@@ -200,6 +204,15 @@ inline InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun
         if (term_is_integer(t) || term_is_binary(t)) {
             InteropFunctionResult result = fold_fun(t, accum);
             if (UNLIKELY(result != InteropOk)) {
+                if (rest_fun) {
+                    // we don't pass failed element, fold_fun handles it
+                    t = temp_stack_pop(&temp_stack);
+                    while (!temp_stack_is_empty(&temp_stack)) {
+                        rest_fun(t, accum);
+                        t = temp_stack_pop(&temp_stack);
+                    }
+                }
+                // we don't process last element either which is the original list
                 temp_stack_destroy(&temp_stack);
                 return result;
             } else {
@@ -217,6 +230,13 @@ inline InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun
             t = term_get_list_head(t);
 
         } else {
+            if (rest_fun) {
+                while (!temp_stack_is_empty(&temp_stack)) {
+                    rest_fun(t, accum);
+                    t = temp_stack_pop(&temp_stack);
+                }
+                // we don't process last element which was the passed term
+            }
             temp_stack_destroy(&temp_stack);
             return InteropBadArg;
         }
@@ -232,7 +252,7 @@ static inline InteropFunctionResult size_fold_fun(term t, void *accum)
     size_t *size = (size_t *) accum;
     if (term_is_integer(t)) {
         *size += 1;
-    } else if (term_is_binary(t)) {
+    } else /* term_is_binary(t) */ {
         *size += term_binary_size(t);
     }
     return InteropOk;
@@ -241,7 +261,7 @@ static inline InteropFunctionResult size_fold_fun(term t, void *accum)
 InteropFunctionResult interop_iolist_size(term t, size_t *size)
 {
     *size = 0;
-    return interop_iolist_fold(t, size_fold_fun, size);
+    return interop_chardata_fold(t, size_fold_fun, NULL, size);
 }
 
 static inline InteropFunctionResult write_string_fold_fun(term t, void *accum)
@@ -250,7 +270,7 @@ static inline InteropFunctionResult write_string_fold_fun(term t, void *accum)
     if (term_is_integer(t)) {
         **p = term_to_int(t);
         (*p)++;
-    } else if (term_is_binary(t)) {
+    } else /* term_is_binary(t) */ {
         int len = term_binary_size(t);
         memcpy(*p, term_binary_data(t), len);
         *p += len;
@@ -260,7 +280,284 @@ static inline InteropFunctionResult write_string_fold_fun(term t, void *accum)
 
 InteropFunctionResult interop_write_iolist(term t, char *p)
 {
-    return interop_iolist_fold(t, write_string_fold_fun, (void *) &p);
+    return interop_chardata_fold(t, write_string_fold_fun, NULL, (void *) &p);
+}
+
+static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *output, size_t *output_len, size_t *rest_crsr, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding)
+{
+    size_t len = term_binary_size(t);
+    if (in_encoding == Latin1Encoding && out_encoding == Latin1Encoding) {
+        if (output) {
+            memcpy(output, term_binary_data(t), len);
+        }
+        *output_len = len;
+        return UnicodeOk;
+    }
+    size_t result = 0;
+    size_t input_index;
+    const uint8_t *input = (const uint8_t *) term_binary_data(t);
+    if (in_encoding == Latin1Encoding) {
+        for (input_index = 0; input_index < len; input_index++) {
+            if (out_encoding == UTF8Encoding) {
+                size_t char_size;
+                if (UNLIKELY(!bitstring_utf8_encode(input[input_index], output, &char_size))) {
+                    *rest_crsr = input_index;
+                    *output_len = result;
+                    return UnicodeError;
+                }
+                result += char_size;
+                if (output) {
+                    output += char_size;
+                }
+            } else {
+                // UCS4Native
+                result += sizeof(uint32_t);
+                if (output) {
+                    *((uint32_t *) output) = input[input_index];
+                    output += sizeof(uint32_t);
+                }
+            }
+        }
+        *output_len = result;
+        return UnicodeOk;
+    }
+    input_index = 0;
+    while (input_index < len) {
+        size_t char_size;
+        uint32_t c;
+        enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size);
+        if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) {
+            *rest_crsr = input_index;
+            *output_len = result;
+            return decode_result == UnicodeTransformDecodeIncomplete ? UnicodeIncompleteTransform : UnicodeError;
+        }
+        switch (out_encoding) {
+            case Latin1Encoding: {
+                if (c > 255) {
+                    *rest_crsr = input_index;
+                    *output_len = result;
+                    return UnicodeError;
+                }
+                if (output) {
+                    *output++ = c;
+                }
+                result++;
+            } break;
+            case UTF8Encoding: {
+                if (output) {
+                    memcpy(output, input + input_index, char_size);
+                    output += char_size;
+                }
+                result += char_size;
+            } break;
+            case UCS4NativeEncoding: {
+                if (output) {
+                    *((uint32_t *) output) = c;
+                    output += sizeof(uint32_t);
+                }
+                result += sizeof(uint32_t);
+            } break;
+        }
+        input_index += char_size;
+    }
+    *output_len = result;
+    return UnicodeOk;
+}
+
+struct CharDataToBytesSizeAcc
+{
+    enum CharDataEncoding in_encoding;
+    enum CharDataEncoding out_encoding;
+    size_t size;
+    size_t rest_size;
+    bool badarg;
+    bool incomplete_transform;
+};
+
+static InteropFunctionResult chardata_to_bytes_size_fold_fun(term t, void *accum)
+{
+    struct CharDataToBytesSizeAcc *acc = (struct CharDataToBytesSizeAcc *) accum;
+    if (term_is_binary(t)) {
+        size_t bin_size;
+        size_t rest_crsr;
+        enum UnicodeConversionResult conv_result = interop_binary_conversion(t, NULL, &bin_size, &rest_crsr, acc->in_encoding, acc->out_encoding);
+        acc->size += bin_size;
+        if (UNLIKELY(conv_result != UnicodeOk)) {
+            acc->rest_size = term_sub_binary_heap_size(t, term_binary_size(t) - rest_crsr);
+            acc->incomplete_transform = conv_result == UnicodeIncompleteTransform;
+            return InteropBadArg;
+        }
+    } else /* term_is_integer(t) */ {
+        avm_int_t c = term_to_int(t);
+        if (c < 0) {
+            return InteropBadArg;
+        }
+        switch (acc->out_encoding) {
+            case Latin1Encoding: {
+                if (c > 255) {
+                    return InteropBadArg;
+                }
+                acc->size++;
+            } break;
+            case UTF8Encoding: {
+                size_t char_size;
+                if (UNLIKELY(!bitstring_utf8_encode(c, NULL, &char_size))) {
+                    return InteropBadArg;
+                }
+                acc->size += char_size;
+            } break;
+            case UCS4NativeEncoding: {
+                acc->size += sizeof(uint32_t);
+            } break;
+        }
+    }
+    return InteropOk;
+}
+
+static void chardata_to_bytes_size_rest_fun(term t, void *accum)
+{
+    struct CharDataToBytesSizeAcc *acc = (struct CharDataToBytesSizeAcc *) accum;
+    if (!term_is_binary(t) && !term_is_integer(t) && !term_is_list(t)) {
+        acc->badarg = true;
+    }
+    if (!acc->badarg) {
+        if (!term_is_nil(t)) {
+            acc->incomplete_transform = false;
+        }
+        acc->rest_size += CONS_SIZE;
+    }
+}
+
+enum UnicodeConversionResult interop_chardata_to_bytes_size(term t, size_t *size, size_t *rest_size, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding)
+{
+    struct CharDataToBytesSizeAcc acc = {
+        .in_encoding = in_encoding,
+        .out_encoding = out_encoding,
+        .size = 0,
+        .rest_size = 0,
+        .badarg = false,
+        .incomplete_transform = false
+    };
+    InteropFunctionResult res = interop_chardata_fold(t, chardata_to_bytes_size_fold_fun, chardata_to_bytes_size_rest_fun, &acc);
+    if (UNLIKELY(res == InteropMemoryAllocFail)) {
+        return UnicodeMemoryAllocFail;
+    }
+    if (acc.badarg) {
+        return UnicodeBadArg;
+    }
+    *size = acc.size;
+    if (rest_size) {
+        *rest_size = acc.rest_size;
+    }
+    if (acc.incomplete_transform) {
+        return UnicodeIncompleteTransform;
+    }
+    return res == InteropOk ? UnicodeOk : UnicodeError;
+}
+
+struct CharDataToBytesAcc
+{
+    enum CharDataEncoding in_encoding;
+    enum CharDataEncoding out_encoding;
+    uint8_t *output;
+    term *rest;
+    Heap *heap;
+    bool badarg;
+    bool incomplete_transform;
+};
+
+static InteropFunctionResult chardata_to_bytes_fold_fun(term t, void *accum)
+{
+    struct CharDataToBytesAcc *acc = (struct CharDataToBytesAcc *) accum;
+    if (term_is_binary(t)) {
+        size_t bin_size;
+        size_t rest_crsr;
+        enum UnicodeConversionResult conv_result = interop_binary_conversion(t, acc->output, &bin_size, &rest_crsr, acc->in_encoding, acc->out_encoding);
+        acc->output += bin_size;
+        if (UNLIKELY(conv_result != UnicodeOk)) {
+            if (acc->rest) {
+                *acc->rest = term_alloc_sub_binary(t, rest_crsr, term_binary_size(t) - rest_crsr, acc->heap);
+            }
+            if (conv_result == UnicodeIncompleteTransform) {
+                acc->incomplete_transform = true;
+            }
+            return InteropBadArg;
+        }
+    } else /* term_is_integer(t) */ {
+        avm_int_t c = term_to_int(t);
+        if (c < 0) {
+            if (acc->rest) {
+                *acc->rest = t;
+            }
+            return InteropBadArg;
+        }
+        switch (acc->out_encoding) {
+            case Latin1Encoding: {
+                if (c > 255) {
+                    if (acc->rest) {
+                        *acc->rest = t;
+                    }
+                    return InteropBadArg;
+                }
+                *acc->output++ = (uint8_t) c;
+            } break;
+            case UTF8Encoding: {
+                size_t char_size;
+                if (UNLIKELY(!bitstring_utf8_encode(c, acc->output, &char_size))) {
+                    if (acc->rest) {
+                        *acc->rest = t;
+                    }
+                    return InteropBadArg;
+                }
+                acc->output += char_size;
+            } break;
+            case UCS4NativeEncoding: {
+                *((uint32_t *) acc->output) = c;
+                acc->output += sizeof(uint32_t);
+            } break;
+        }
+    }
+    return InteropOk;
+}
+
+static void chardata_to_bytes_rest_fun(term t, void *accum)
+{
+    struct CharDataToBytesAcc *acc = (struct CharDataToBytesAcc *) accum;
+    if (!term_is_binary(t) && !term_is_integer(t) && !term_is_list(t)) {
+        acc->badarg = true;
+    }
+    if (!acc->badarg) {
+        if (!term_is_nil(t)) {
+            acc->incomplete_transform = false;
+        }
+        if (acc->rest) {
+            *acc->rest = term_list_prepend(*acc->rest, t, acc->heap);
+        }
+    }
+}
+
+enum UnicodeConversionResult interop_chardata_to_bytes(term t, uint8_t *output, term *rest, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding, Heap *heap)
+{
+    struct CharDataToBytesAcc acc = {
+        .in_encoding = in_encoding,
+        .out_encoding = out_encoding,
+        .output = output,
+        .rest = rest,
+        .heap = heap,
+        .badarg = false,
+        .incomplete_transform = false
+    };
+    InteropFunctionResult res = interop_chardata_fold(t, chardata_to_bytes_fold_fun, chardata_to_bytes_rest_fun, &acc);
+    if (UNLIKELY(res == InteropMemoryAllocFail)) {
+        return UnicodeMemoryAllocFail;
+    }
+    if (acc.badarg) {
+        return UnicodeBadArg;
+    }
+    if (acc.incomplete_transform) {
+        return UnicodeIncompleteTransform;
+    }
+    return res == InteropOk ? UnicodeOk : UnicodeError;
 }
 
 term interop_map_get_value(GlobalContext *glb, term map, term key)
diff --git a/src/libAtomVM/interop.h b/src/libAtomVM/interop.h
index c1d92b836..1c05ae5ea 100644
--- a/src/libAtomVM/interop.h
+++ b/src/libAtomVM/interop.h
@@ -35,6 +35,15 @@ typedef enum
     InteropBadArg
 } InteropFunctionResult;
 
+enum UnicodeConversionResult
+{
+    UnicodeOk = InteropOk,
+    UnicodeMemoryAllocFail = InteropMemoryAllocFail,
+    UnicodeBadArg = InteropBadArg,
+    UnicodeError,
+    UnicodeIncompleteTransform
+};
+
 /**
  * An idiomatic macro for marking an AtomStringIntPair table entry as a
  * interop_atom_term_select_int default.
@@ -53,7 +62,8 @@ typedef struct
     int i_val;
 } AtomStringIntPair;
 
-typedef InteropFunctionResult (*interop_iolist_fold_fun)(term t, void *accum);
+typedef InteropFunctionResult (*interop_chardata_fold_fun)(term t, void *accum);
+typedef void (*interop_chardata_rest_fun)(term t, void *accum);
 
 char *interop_term_to_string(term t, int *ok);
 char *interop_binary_to_string(term binary);
@@ -67,7 +77,17 @@ term interop_map_get_value_default(GlobalContext *glb, term map, term key, term
 
 NO_DISCARD InteropFunctionResult interop_iolist_size(term t, size_t *size);
 NO_DISCARD InteropFunctionResult interop_write_iolist(term t, char *p);
-NO_DISCARD InteropFunctionResult interop_iolist_fold(term t, interop_iolist_fold_fun fold_fun, void *accum);
+NO_DISCARD InteropFunctionResult interop_chardata_fold(term t, interop_chardata_fold_fun fold_fun, interop_chardata_rest_fun rest_fun, void *accum);
+
+enum CharDataEncoding
+{
+    Latin1Encoding, // selected by the latin1 atom in the unicode NIFs
+    UTF8Encoding, // default of the unicode NIFs; also selected by the utf8 atom
+    UCS4NativeEncoding // Only available for output for characters_to_list
+};
+
+NO_DISCARD enum UnicodeConversionResult interop_chardata_to_bytes_size(term t, size_t *size, size_t *rest_size, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding);
+NO_DISCARD enum UnicodeConversionResult interop_chardata_to_bytes(term t, uint8_t *output, term *rest, enum CharDataEncoding in_encoding, enum CharDataEncoding out_encoding, Heap *heap);
 
 /**
  * @brief Finds on a table the first matching atom string.
diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c
index bf9083407..caa43ed52 100644
--- a/src/libAtomVM/nifs.c
+++ b/src/libAtomVM/nifs.c
@@ -164,6 +164,8 @@ static term nif_base64_decode_to_string(Context *ctx, int argc, term argv[]);
 static term nif_code_load_abs(Context *ctx, int argc, term argv[]);
 static term nif_code_load_binary(Context *ctx, int argc, term argv[]);
 static term nif_maps_next(Context *ctx, int argc, term argv[]);
+static term nif_unicode_characters_to_list(Context *ctx, int argc, term argv[]);
+static term nif_unicode_characters_to_binary(Context *ctx, int argc, term argv[]);
 
 #define DECLARE_MATH_NIF_FUN(moniker) \
     static term nif_math_##moniker(Context *ctx, int argc, term argv[]);
@@ -706,6 +708,16 @@ static const struct Nif maps_next_nif =
     .base.type = NIFFunctionType,
     .nif_ptr = nif_maps_next
 };
+static const struct Nif unicode_characters_to_list_nif =
+{
+    .base.type = NIFFunctionType,
+    .nif_ptr = nif_unicode_characters_to_list // one C entry point for arities 1 and 2 (see nifs.gperf)
+};
+static const struct Nif unicode_characters_to_binary_nif =
+{
+    .base.type = NIFFunctionType,
+    .nif_ptr = nif_unicode_characters_to_binary // one C entry point for arities 1, 2 and 3 (see nifs.gperf)
+};
 
 #define DEFINE_MATH_NIF(moniker)                    \
     static const struct Nif math_##moniker##_nif =  \
@@ -4216,6 +4228,118 @@ static term nif_maps_next(Context *ctx, int argc, term argv[])
     return ret;
 }
 
+// unicode:characters_to_list/1,2: returns a list of code points, or
+// {error | incomplete, Converted, Rest} when conversion stops early.
+static term nif_unicode_characters_to_list(Context *ctx, int argc, term argv[])
+{
+    enum CharDataEncoding in_encoding = UTF8Encoding;
+    if (argc == 2) {
+        if (argv[1] == LATIN1_ATOM) {
+            in_encoding = Latin1Encoding;
+        } else if (UNLIKELY((argv[1] != UTF8_ATOM))) {
+            RAISE_ERROR(BADARG_ATOM);
+        }
+    }
+    size_t size;
+    size_t rest_size;
+    enum UnicodeConversionResult conv_result = interop_chardata_to_bytes_size(argv[0], &size, &rest_size, in_encoding, UCS4NativeEncoding);
+    if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) {
+        RAISE_ERROR(OUT_OF_MEMORY_ATOM);
+    }
+    if (UNLIKELY(conv_result == UnicodeBadArg)) {
+        RAISE_ERROR(BADARG_ATOM);
+    }
+    size_t len = size / sizeof(uint32_t);
+    // malloc(0) may legally return NULL: always request at least one element
+    // so that empty output is not misreported as out_of_memory.
+    uint32_t *chars = malloc(size > 0 ? size : sizeof(uint32_t));
+    if (IS_NULL_PTR(chars)) {
+        RAISE_ERROR(OUT_OF_MEMORY_ATOM);
+    }
+    size_t needed_terms = CONS_SIZE * len;
+    if (UNLIKELY(conv_result == UnicodeError || conv_result == UnicodeIncompleteTransform)) {
+        needed_terms += rest_size + TUPLE_SIZE(3);
+    }
+    if (UNLIKELY(memory_ensure_free(ctx, needed_terms) != MEMORY_GC_OK)) {
+        free(chars);
+        RAISE_ERROR(OUT_OF_MEMORY_ATOM);
+    }
+    term rest = term_nil();
+    conv_result = interop_chardata_to_bytes(argv[0], (uint8_t *) chars, &rest, in_encoding, UCS4NativeEncoding, &ctx->heap);
+    if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) {
+        free(chars);
+        RAISE_ERROR(OUT_OF_MEMORY_ATOM);
+    }
+    // Prepend from the last code point so the list ends up in input order.
+    term result = term_nil();
+    for (size_t i = len; i > 0; i--) {
+        result = term_list_prepend(term_from_int(chars[i - 1]), result, &ctx->heap);
+    }
+    free(chars);
+    if (LIKELY(conv_result == UnicodeOk)) {
+        return result;
+    }
+    term result_tuple = term_alloc_tuple(3, &ctx->heap);
+    term_put_tuple_element(result_tuple, 0, conv_result == UnicodeError ? ERROR_ATOM : INCOMPLETE_ATOM);
+    term_put_tuple_element(result_tuple, 1, result);
+    term_put_tuple_element(result_tuple, 2, rest);
+    return result_tuple;
+}
+
+// unicode:characters_to_binary/1,2,3: returns a binary, or
+// {error | incomplete, Converted, Rest} when conversion stops early.
+static term nif_unicode_characters_to_binary(Context *ctx, int argc, term argv[])
+{
+    enum CharDataEncoding in_encoding = UTF8Encoding;
+    enum CharDataEncoding out_encoding = UTF8Encoding;
+    if (argc > 1) {
+        if (argv[1] == LATIN1_ATOM) {
+            in_encoding = Latin1Encoding;
+        } else if (UNLIKELY((argv[1] != UTF8_ATOM))) {
+            RAISE_ERROR(BADARG_ATOM);
+        }
+        if (argc == 3) {
+            if (argv[2] == LATIN1_ATOM) {
+                out_encoding = Latin1Encoding;
+            } else if (UNLIKELY((argv[2] != UTF8_ATOM))) {
+                RAISE_ERROR(BADARG_ATOM);
+            }
+        }
+    }
+    size_t len;
+    size_t rest_size;
+    enum UnicodeConversionResult conv_result = interop_chardata_to_bytes_size(argv[0], &len, &rest_size, in_encoding, out_encoding);
+    if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) {
+        RAISE_ERROR(OUT_OF_MEMORY_ATOM);
+    }
+    if (UNLIKELY(conv_result == UnicodeBadArg)) {
+        RAISE_ERROR(BADARG_ATOM);
+    }
+    // NOTE(review): confirm this reservation also covers the binary's boxed header.
+    size_t needed_terms = term_binary_data_size_in_terms(len);
+    if (UNLIKELY(conv_result == UnicodeError || conv_result == UnicodeIncompleteTransform)) {
+        needed_terms += TUPLE_SIZE(3) + rest_size;
+    }
+    if (UNLIKELY(memory_ensure_free(ctx, needed_terms) != MEMORY_GC_OK)) {
+        RAISE_ERROR(OUT_OF_MEMORY_ATOM);
+    }
+    term result = term_create_uninitialized_binary(len, &ctx->heap, ctx->global);
+    uint8_t *binary_data = (uint8_t *) term_binary_data(result);
+    term rest = term_nil();
+    conv_result = interop_chardata_to_bytes(argv[0], binary_data, &rest, in_encoding, out_encoding, &ctx->heap);
+    if (UNLIKELY(conv_result == UnicodeMemoryAllocFail)) {
+        RAISE_ERROR(OUT_OF_MEMORY_ATOM);
+    }
+    if (LIKELY(conv_result == UnicodeOk)) {
+        return result;
+    }
+    term result_tuple = term_alloc_tuple(3, &ctx->heap);
+    term_put_tuple_element(result_tuple, 0, conv_result == UnicodeError ? ERROR_ATOM : INCOMPLETE_ATOM);
+    term_put_tuple_element(result_tuple, 1, result);
+    term_put_tuple_element(result_tuple, 2, rest);
+    return result_tuple;
+}
+
 //
 // MAINTENANCE NOTE: Exception handling for fp operations using math
 // error handling is designed to be thread-safe, as errors are specified
diff --git a/src/libAtomVM/nifs.gperf b/src/libAtomVM/nifs.gperf
index 610af249e..65f7bbaf4 100644
--- a/src/libAtomVM/nifs.gperf
+++ b/src/libAtomVM/nifs.gperf
@@ -142,6 +142,11 @@ base64:decode/1, &base64_decode_nif
 base64:encode_to_string/1, &base64_encode_to_string_nif
 base64:decode_to_string/1, &base64_decode_to_string_nif
 maps:next/1, &maps_next_nif
+unicode:characters_to_list/1, &unicode_characters_to_list_nif
+unicode:characters_to_list/2, &unicode_characters_to_list_nif
+unicode:characters_to_binary/1, &unicode_characters_to_binary_nif
+unicode:characters_to_binary/2, &unicode_characters_to_binary_nif
+unicode:characters_to_binary/3, &unicode_characters_to_binary_nif
 math:acos/1, &math_acos_nif
 math:acosh/1, &math_acosh_nif
 math:asin/1, &math_asin_nif
diff --git a/src/libAtomVM/opcodesswitch.h b/src/libAtomVM/opcodesswitch.h
index e2640d9e6..6707320df 100644
--- a/src/libAtomVM/opcodesswitch.h
+++ b/src/libAtomVM/opcodesswitch.h
@@ -3954,7 +3954,7 @@ HOT_FUNC int scheduler_entry_point(GlobalContext *glb)
                     term src_bin = term_get_match_state_binary(src);
                     avm_int_t offset_bits = term_get_match_state_offset(src);
 
-                    int32_t val = 0;
+                    uint32_t val = 0;
                     size_t out_size = 0;
                     bool is_valid = bitstring_match_utf8(src_bin, (size_t) offset_bits, &val, &out_size);
 
@@ -3994,7 +3994,7 @@ HOT_FUNC int scheduler_entry_point(GlobalContext *glb)
                     term src_bin = term_get_match_state_binary(src);
                     avm_int_t offset_bits = term_get_match_state_offset(src);
 
-                    int32_t c = 0;
+                    uint32_t c = 0;
                     size_t out_size = 0;
                     bool is_valid = bitstring_match_utf8(src_bin, (size_t) offset_bits, &c, &out_size);
 
diff --git a/tests/erlang_tests/CMakeLists.txt b/tests/erlang_tests/CMakeLists.txt
index 50dafb53d..54f5bfc93 100644
--- a/tests/erlang_tests/CMakeLists.txt
+++ b/tests/erlang_tests/CMakeLists.txt
@@ -265,6 +265,7 @@ compile_erlang(test_integer_to_binary)
 compile_erlang(test_list_to_binary)
 compile_erlang(test_binary_to_list)
 compile_erlang(test_atom_to_binary)
+compile_erlang(test_unicode)
 
 compile_erlang(test_binary_part)
 compile_erlang(test_binary_split)
@@ -702,6 +703,7 @@ add_custom_target(erlang_test_modules DEPENDS
     test_list_to_binary.beam
     test_binary_to_list.beam
     test_atom_to_binary.beam
+    test_unicode.beam
 
     test_binary_part.beam
     test_binary_split.beam
diff --git a/tests/erlang_tests/test_unicode.erl b/tests/erlang_tests/test_unicode.erl
new file mode 100644
index 000000000..b7a12dc1f
--- /dev/null
+++ b/tests/erlang_tests/test_unicode.erl
@@ -0,0 +1,147 @@
+%
+% This file is part of AtomVM.
+%
+% Copyright 2023 Paul Guyot <pguyot@kallisys.net>
+%
+% Licensed under the Apache License, Version 2.0 (the "License");
+% you may not use this file except in compliance with the License.
+% You may obtain a copy of the License at
+%
+%    http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+% See the License for the specific language governing permissions and
+% limitations under the License.
+%
+% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
+%
+
+-module(test_unicode).
+
+-export([start/0]).
+
+start() -> % entry point; a sub-test that does not return ok crashes with badmatch
+    ok = test_to_list_latin1(),
+    ok = test_to_list_utf8(),
+    ok = test_to_binary_latin1(),
+    ok = test_to_binary_utf8(),
+    0. % presumably the result tests/test.c expects from TEST_CASE — confirm
+
+test_to_list_latin1() ->
+    "hello" = unicode:characters_to_list(<<"hello">>, latin1),
+    "hello" = unicode:characters_to_list("hello", latin1),
+    "hé" = unicode:characters_to_list(<<"hé">>, latin1),
+    "hé" = unicode:characters_to_list("hé", latin1), % list input with a character above 127
+    {error, "h", [-1]} = unicode:characters_to_list([$h, -1], latin1), % -1 is not a character
+    {error, "h", [-1 | "ello"]} = unicode:characters_to_list([$h, -1 | "ello"], latin1),
+    {error, "fooh", [[-1 | "ello"], "bar"]} = unicode:characters_to_list(
+        ["foo", [$h, -1 | "ello"], "bar"], latin1
+    ),
+    ok =
+        try
+            unicode:characters_to_list([$h, self()], latin1), % a pid is not chardata
+            fail
+        catch
+            error:badarg -> ok
+        end,
+    ok.
+
+test_to_list_utf8() ->
+    % Successful conversions: UTF-8 is the default input encoding.
+    "hello" = unicode:characters_to_list(<<"hello">>),
+    "hello" = unicode:characters_to_list("hello", utf8),
+    "hé" = unicode:characters_to_list(<<"hé"/utf8>>),
+    "hé" = unicode:characters_to_list("hé"),
+    "hé" = unicode:characters_to_list(<<"hé"/utf8>>, utf8),
+    {error, "h", [-1]} = unicode:characters_to_list([$h, -1], utf8),
+    {incomplete, "h", <<"é">>} = unicode:characters_to_list(<<"hé">>),
+    {error, [], <<16#A0, 16#A1>>} = unicode:characters_to_list(<<16#A0, 16#A1>>),
+    % The rest term is documented by Erlang/OTP as "mostly for debugging";
+    % its nesting differs between BEAM and AtomVM, so expect either shape.
+    {RestBar, RestFoobar} =
+        case erlang:system_info(machine) of
+            "BEAM" -> {[<<"é">>, [[["bar"]]]], [<<"é">>, [[["bar"], "foobar"]]]};
+            "ATOM" -> {[[<<"é">>, ["bar"]]], [[<<"é">>, ["bar"], "foobar"]]}
+        end,
+    {error, "fooh", RestBar} = unicode:characters_to_list(
+        ["foo", [<<"hé">>, ["bar"]]]
+    ),
+    {error, "fooh", RestFoobar} = unicode:characters_to_list(
+        ["foo", [<<"hé">>, ["bar"], "foobar"]]
+    ),
+    ok.
+
+test_to_binary_latin1() ->
+    Machine = erlang:system_info(machine),
+    % Plain conversions to latin1 from list and binary input.
+    <<"hello">> = unicode:characters_to_binary("hello", latin1, latin1),
+    <<"hello">> = unicode:characters_to_binary(<<"hello">>, latin1, latin1),
+    <<"hé">> = unicode:characters_to_binary("hé", latin1, latin1),
+    <<"hé">> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8, latin1),
+    % A negative character: Erlang/OTP raises badarg for latin1 output,
+    % whereas AtomVM returns an error tuple.
+    ok =
+        case Machine of
+            "BEAM" ->
+                try
+                    unicode:characters_to_binary([$h, -1], latin1, latin1)
+                catch
+                    error:badarg -> ok
+                end;
+            "ATOM" ->
+                {error, <<"h">>, [-1]} = unicode:characters_to_binary([$h, -1], latin1, latin1),
+                ok
+        end,
+    % A character that does not fit latin1 ends up in the rest term; BEAM
+    % additionally wraps it in a list of its own.
+    Bad =
+        case Machine of
+            "BEAM" -> [2000];
+            "ATOM" -> 2000
+        end,
+    {error, <<"h">>, [Bad]} = unicode:characters_to_binary(
+        [$h, 2000], latin1, latin1
+    ),
+    {error, <<"h">>, [Bad | "ello"]} = unicode:characters_to_binary(
+        [$h, 2000 | "ello"], latin1, latin1
+    ),
+    {error, <<"fooh">>, [[Bad | "ello"], "bar"]} = unicode:characters_to_binary(
+        ["foo", [$h, 2000 | "ello"], "bar"], latin1, latin1
+    ),
+    % A term that is not chardata at all must raise badarg.
+    ok =
+        try
+            unicode:characters_to_binary([$h, self()], latin1, latin1),
+            fail
+        catch
+            error:badarg -> ok
+        end,
+    ok.
+
+test_to_binary_utf8() ->
+    <<"hello">> = unicode:characters_to_binary("hello", utf8, utf8),
+    <<"hello">> = unicode:characters_to_binary(<<"hello">>, utf8, utf8),
+    <<"hé"/utf8>> = unicode:characters_to_binary("hé", latin1, utf8),
+    <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8, utf8),
+    <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8),
+    <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>),
+    {error, <<"h">>, [-1]} = unicode:characters_to_binary([$h, -1]),
+    {incomplete, <<"h">>, <<"é">>} = unicode:characters_to_binary(<<"hé">>),
+    {error, <<>>, <<16#A0, 16#A1>>} = unicode:characters_to_binary(<<16#A0, 16#A1>>),
+    % When conversion stops inside nested chardata, the shape of the rest
+    % term differs between BEAM and AtomVM; pick both expectations with a
+    % single machine lookup.
+    {RestBar, RestFoobar} =
+        case erlang:system_info(machine) of
+            "BEAM" -> {[<<"é">>, [[["bar"]]]], [<<"é">>, [[["bar"], "foobar"]]]};
+            "ATOM" -> {[[<<"é">>, ["bar"]]], [[<<"é">>, ["bar"], "foobar"]]}
+        end,
+    {error, <<"fooh">>, RestBar} = unicode:characters_to_binary(
+        ["foo", [<<"hé">>, ["bar"]]]
+    ),
+    {error, <<"fooh">>, RestFoobar} = unicode:characters_to_binary(
+        ["foo", [<<"hé">>, ["bar"], "foobar"]]
+    ),
+    ok.
diff --git a/tests/libs/estdlib/test_io_lib.erl b/tests/libs/estdlib/test_io_lib.erl
index d2521327f..30afdc00a 100644
--- a/tests/libs/estdlib/test_io_lib.erl
+++ b/tests/libs/estdlib/test_io_lib.erl
@@ -146,8 +146,8 @@ test() ->
     %   ?ASSERT_MATCH(?FLT(io_lib:format("~-3p", ["foobar"])), "foo"),
     ?ASSERT_MATCH(?FLT(io_lib:format("~3s", ["foobar"])), "foo"),
     ?ASSERT_MATCH(?FLT(io_lib:format("~s", [<<"hé"/utf8>>])), [104, 195, 169]),
-    %   ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8>>])), [104,233]),
-    %   ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8, 223>>])), [104,195,169,223]),
+    ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8>>])), [104, 233]),
+    ?ASSERT_MATCH(?FLT(io_lib:format("~ts", [<<"hé"/utf8, 223>>])), [104, 195, 169, 223]),
     ?ASSERT_MATCH(?FLT(io_lib:format("~-3s", ["foobar"])), "foo"),
     ?ASSERT_MATCH(?FLT(io_lib:format("~3s", ["foo"])), "foo"),
     ?ASSERT_MATCH(?FLT(io_lib:format("~-3s", ["foo"])), "foo"),
diff --git a/tests/test.c b/tests/test.c
index caf1fbd04..c170a6a5a 100644
--- a/tests/test.c
+++ b/tests/test.c
@@ -281,6 +281,7 @@ struct Test tests[] = {
     TEST_CASE(test_list_to_binary),
     TEST_CASE_EXPECTED(test_binary_to_list, 0),
     TEST_CASE_EXPECTED(test_atom_to_binary, 1),
+    TEST_CASE(test_unicode),
 
     TEST_CASE_EXPECTED(test_binary_part, 12),
     TEST_CASE_EXPECTED(test_binary_split, 16),