erlang · bjorng · Feb 1, 2017 · May 31, 2016 · josevalim · Sep 26, 2016
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml
@@ -119,16 +119,11 @@
     <tcaption>Compressed Data Format when Expanded</tcaption></table>
     <marker id="utf8_atoms"/>
     <note>
-      <p>As from ERTS 5.10 (OTP R16) support
-        for UTF-8 encoded atoms has been introduced in the external format.
-        However, only characters that can be encoded using Latin-1 (ISO-8859-1)
-        are currently supported in atoms. The support for UTF-8 encoded atoms
-        in the external format has been implemented to be able to support
-        all Unicode characters in atoms in <em>some future release</em>.
-        Until full Unicode support for atoms has been introduced,
-        it is an <em>error</em> to pass atoms containing
-        characters that cannot be encoded in Latin-1, and <em>the behavior is
-        undefined</em>.</p>
+      <p>As from ERTS 9.0 (OTP 20), UTF-8 encoded atoms may contain any Unicode
+        character. Although the support for UTF-8 encoded atoms in the external
+        format is available since ERTS 5.10 (OTP R16), passing atoms that cannot
+        be encoded in Latin-1 is an <em>error</em> in versions earlier than
+        Erlang/OTP 20, and <em>the behavior is undefined</em>.</p>
       <p>When distribution flag <seealso marker="erl_dist_protocol#dflags">
         <c>DFLAG_UTF8_ATOMS</c></seealso> has been exchanged between both nodes
         in the <seealso marker="erl_dist_protocol#distribution_handshake">

diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml
@@ -325,16 +325,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
           is <c>latin1</c>, one byte exists for each character
           in the text representation. If <c><anno>Encoding</anno></c> is
           <c>utf8</c> or
-          <c>unicode</c>, the characters are encoded using UTF-8
-          (that is, characters from 128 through 255 are
-          encoded in two bytes).</p>
+          <c>unicode</c>, the characters are encoded using UTF-8 where
+          characters may require multiple bytes.</p>
         <note>
-          <p><c>atom_to_binary(<anno>Atom</anno>, latin1)</c> never
-            fails, as the text representation of an atom can only
-            contain characters from 0 through 255. In a future release,
-            the text representation
-            of atoms can be allowed to contain any Unicode character and
-            <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> then fails if the
+          <p>As from Erlang/OTP 20, atoms can contain any Unicode character
+            and <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> may fail if the
             text representation for <c><anno>Atom</anno></c> contains a Unicode
             character &gt; 255.</p>
         </note>
@@ -402,13 +397,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
           translation of bytes in the binary is done.
           If <c><anno>Encoding</anno></c>
           is <c>utf8</c> or <c>unicode</c>, the binary must contain
-          valid UTF-8 sequences. Only Unicode characters up
-          to 255 are allowed.</p>
+          valid UTF-8 sequences.</p>
         <note>
-          <p><c>binary_to_atom(<anno>Binary</anno>, utf8)</c> fails if
-            the binary contains Unicode characters &gt; 255.
-            In a future release, such Unicode characters can be allowed and
-            <c>binary_to_atom(<anno>Binary</anno>, utf8)</c> does then not fail.
+          <p>As from Erlang/OTP 20, <c>binary_to_atom(<anno>Binary</anno>, utf8)</c>
+            is capable of encoding any Unicode character. Earlier versions would
+            fail if the binary contained Unicode characters &gt; 255.
             For more information about Unicode support in atoms, see the
             <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8
             encoded atoms</seealso>
@@ -419,9 +412,7 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
 > <input>binary_to_atom(&lt;&lt;"Erlang"&gt;&gt;, latin1).</input>
 'Erlang'
 > <input>binary_to_atom(&lt;&lt;1024/utf8&gt;&gt;, utf8).</input>
-** exception error: bad argument
-     in function  binary_to_atom/2
-        called as binary_to_atom(&lt;&lt;208,128&gt;&gt;,utf8)</pre>
+'Ѐ'</pre>
       </desc>
     </func>
 
@@ -2401,10 +2392,10 @@ os_prompt%</pre>
       <desc>
         <p>Returns the atom whose text representation is
           <c><anno>String</anno></c>.</p>
-        <p><c><anno>String</anno></c> can only contain ISO-latin-1
-          characters (that is, numbers &lt; 256) as the implementation does not
-          allow Unicode characters equal to or above 256 in atoms.
-          For more information on Unicode support in atoms, see
+        <p>As from Erlang/OTP 20, <c><anno>String</anno></c> may contain
+          any Unicode character. Earlier versions allowed only ISO-latin-1
+          characters as the implementation did not allow Unicode characters
+          above 255. For more information on Unicode support in atoms, see
           <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8
           encoded atoms</seealso>
           in section "External Term Format" in the User's Guide.</p>

diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
@@ -233,10 +233,10 @@ static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
 }
 
 /*
- * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ * erts_atom_put_index() may fail. Returns negative indexes for errors.
  */
-Eterm
-erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
+int
+erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 {
     byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
     const byte *text = name;
@@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	if (trunc)
 	    tlen = 0;
 	else
-	    return THE_NON_VALUE;
+	    return ATOM_MAX_CHARS_ERROR;
     }
 
     switch (enc) {
@@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	    if (trunc)
 		tlen = MAX_ATOM_CHARACTERS;
 	    else
-		return THE_NON_VALUE;
+		return ATOM_MAX_CHARS_ERROR;
 	}
 #ifdef DEBUG
 	for (aix = 0; aix < len; aix++) {
@@ -276,15 +276,15 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	    if (trunc)
 		tlen = MAX_ATOM_CHARACTERS;
 	    else
-		return THE_NON_VALUE;
+		return ATOM_MAX_CHARS_ERROR;
 	}
 	no_latin1_chars = tlen;
 	latin1_to_utf8(utf8_copy, &text, &tlen);
 	break;
     case ERTS_ATOM_ENC_UTF8:
 	/* First sanity check; need to verify later */
 	if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
-	    return THE_NON_VALUE;
+	    return ATOM_MAX_CHARS_ERROR;
 	break;
     }
 
@@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
     atom_read_unlock();
     if (aix >= 0) {
 	/* Already in table no need to verify it */
-	return make_atom(aix);
+	return aix;
     }
 
     if (enc == ERTS_ATOM_ENC_UTF8) {
@@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	case ERTS_UTF8_OK_MAX_CHARS:
 	    /* Truncated... */
 	    if (!trunc)
-		return THE_NON_VALUE;
+		return ATOM_MAX_CHARS_ERROR;
 	    ASSERT(no_chars == MAX_ATOM_CHARACTERS);
 	    tlen = err_pos - text;
 	    break;
 	default:
 	    /* Bad utf8... */
-	    return THE_NON_VALUE;
+	    return ATOM_BAD_ENCODING_ERROR;
 	}
     }
 
@@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
     atom_write_lock();
     aix = index_put(&erts_atom_table, (void*) &a);
     atom_write_unlock();
-    return make_atom(aix);
+    return aix;
+}
+
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
+Eterm
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
+{
+    int aix = erts_atom_put_index(name, len, enc, trunc);
+    if (aix >= 0)
+	return make_atom(aix);
+    else
+	return THE_NON_VALUE;
 }
 
 Eterm

diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
@@ -29,6 +29,8 @@
 #define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
 #define ATOM_LIMIT (1024*1024)
 #define MIN_ATOM_TABLE_SIZE 8192
+#define ATOM_BAD_ENCODING_ERROR -1
+#define ATOM_MAX_CHARS_ERROR -2
 
 #ifndef ARCH_32
 /* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an
@@ -133,6 +135,7 @@ int atom_table_sz(void);	/* table size in bytes, excluding stored objects */
 
 Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
 Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
+int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
 void init_atom_table(void);
 void atom_info(fmtfn_t, void *);
 void dump_atoms(fmtfn_t, void *);

diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
@@ -157,13 +157,15 @@ typedef struct {
 #define STR_CHUNK 2
 #define IMP_CHUNK 3
 #define EXP_CHUNK 4
-#define NUM_MANDATORY 5
+#define MIN_MANDATORY 1
+#define MAX_MANDATORY 5
 
 #define LAMBDA_CHUNK 5
 #define LITERAL_CHUNK 6
 #define ATTR_CHUNK 7
 #define COMPILE_CHUNK 8
 #define LINE_CHUNK 9
+#define UTF8_ATOM_CHUNK 10
 
 #define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0]))
 
@@ -173,9 +175,13 @@ typedef struct {
 
 static Uint chunk_types[] = {
     /*
-     * Mandatory chunk types -- these MUST be present.
+     * Atom chunk types -- Atom or AtU8 MUST be present.
      */
     MakeIffId('A', 't', 'o', 'm'), /* 0 */
+
+    /*
+     * Mandatory chunk types -- these MUST be present.
+     */
     MakeIffId('C', 'o', 'd', 'e'), /* 1 */
     MakeIffId('S', 't', 'r', 'T'), /* 2 */
     MakeIffId('I', 'm', 'p', 'T'), /* 3 */
@@ -189,6 +195,7 @@ static Uint chunk_types[] = {
     MakeIffId('A', 't', 't', 'r'), /* 7 */
     MakeIffId('C', 'I', 'n', 'f'), /* 8 */
     MakeIffId('L', 'i', 'n', 'e'), /* 9 */
+    MakeIffId('A', 't', 'U', '8'), /* 10 */
 };
 
 /*
@@ -490,9 +497,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks,
 #endif
 static int init_iff_file(LoaderState* stp, byte* code, Uint size);
 static int scan_iff_file(LoaderState* stp, Uint* chunk_types,
-			 Uint num_types, Uint num_mandatory);
+			 Uint num_types);
 static int verify_chunks(LoaderState* stp);
-static int load_atom_table(LoaderState* stp);
+static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc);
 static int load_import_table(LoaderState* stp);
 static int read_export_table(LoaderState* stp);
 static int is_bif(Eterm mod, Eterm func, unsigned arity);
@@ -629,7 +636,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
     CHKALLOC();
     CHKBLK(ERTS_ALC_T_CODE,stp->code);
     if (!init_iff_file(stp, code, unloaded_size) ||
-	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
 	!verify_chunks(stp)) {
 	goto load_error;
     }
@@ -674,9 +681,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
      */
 
     CHKBLK(ERTS_ALC_T_CODE,stp->code);
-    define_file(stp, "atom table", ATOM_CHUNK);
-    if (!load_atom_table(stp)) {
-	goto load_error;
+    if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
+        define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
+            goto load_error;
+        }
+    } else {
+        define_file(stp, "atom table", ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
+            goto load_error;
+        }
     }
 
     /*
@@ -1212,7 +1226,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size)
  * Scan the IFF file. The header should have been verified by init_iff_file().
  */
 static int
-scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory)
+scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types)
 {
     Uint count;
     Uint id;
@@ -1291,7 +1305,16 @@ verify_chunks(LoaderState* stp)
     MD5_CTX context;
 
     MD5Init(&context);
-    for (i = 0; i < NUM_MANDATORY; i++) {
+
+    if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) {
+	MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size);
+    } else if (stp->chunks[ATOM_CHUNK].start != NULL) {
+	MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size);
+    } else {
+        LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n");
+    }
+
+    for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) {
 	if (stp->chunks[i].start != NULL) {
 	    MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size);
 	} else {
@@ -1352,7 +1375,7 @@ verify_chunks(LoaderState* stp)
 }
 
 static int
-load_atom_table(LoaderState* stp)
+load_atom_table(LoaderState* stp, ErtsAtomEncoding enc)
 {
     unsigned int i;
 
@@ -1371,7 +1394,7 @@ load_atom_table(LoaderState* stp)
 
 	GetByte(stp, n);
 	GetString(stp, atom, n);
-	stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
+	stp->atom[i] = erts_atom_put(atom, n, enc, 1);
     }
 
     /*
@@ -5937,7 +5960,7 @@ code_get_chunk_2(BIF_ALIST_2)
 	goto error;
     }
     if (!init_iff_file(stp, start, binary_size(Bin)) ||
-	!scan_iff_file(stp, &chunk, 1, 1) ||
+	!scan_iff_file(stp, &chunk, 1) ||
 	stp->chunks[0].start == NULL) {
 	res = am_undefined;
 	goto done;
@@ -5986,7 +6009,7 @@ code_module_md5_1(BIF_ALIST_1)
     }
     stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */
     if (!init_iff_file(stp, bytes, binary_size(Bin)) ||
-	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
 	!verify_chunks(stp)) {
 	res = am_undefined;
 	goto done;
@@ -6335,17 +6358,24 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info)
     if (!init_iff_file(stp, bytes, size)) {
 	goto error;
     }
-    if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+    if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
 	!verify_chunks(stp)) {
 	goto error;
     }
     define_file(stp, "code chunk header", CODE_CHUNK);
     if (!read_code_header(stp)) {
 	goto error;
     }
-    define_file(stp, "atom table", ATOM_CHUNK);
-    if (!load_atom_table(stp)) {
-	goto error;
+    if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
+        define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
+            goto error;
+        }
+    } else {
+        define_file(stp, "atom table", ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
+            goto error;
+        }
     }
     define_file(stp, "export table", EXP_CHUNK);
     if (!stub_read_export_table(stp)) {