From e426d18e6de71ff390cd7917f96516ffae3a551b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Tue, 31 May 2016 14:28:54 +0200 Subject: [PATCH] Add new AtU8 beam chunk The new chunk stores atoms encoded in UTF-8. beam_lib has also been modified to handle the new 'utf8_atoms' attribute while the 'atoms' attribute may be a missing chunk from now on. The binary_to_atom/2 BIF can now encode any utf8 binary with up to 255 characters. --- erts/emulator/beam/atom.c | 35 +++++++++---- erts/emulator/beam/atom.h | 3 ++ erts/emulator/beam/beam_load.c | 66 +++++++++++++++++------- erts/emulator/beam/erl_unicode.c | 83 ++++++++++++------------------ erts/emulator/test/bif_SUITE.erl | 11 ++-- erts/emulator/test/code_SUITE.erl | 10 ++-- lib/compiler/src/beam_asm.erl | 27 ++++++---- lib/compiler/src/beam_dict.erl | 12 ++--- lib/compiler/src/beam_disasm.erl | 4 +- lib/compiler/src/compile.erl | 12 +++-- lib/compiler/test/lc_SUITE.erl | 2 +- lib/stdlib/src/beam_lib.erl | 52 ++++++++++++------- lib/stdlib/test/beam_lib_SUITE.erl | 15 +++--- 13 files changed, 197 insertions(+), 135 deletions(-) diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index a5e778e4aa4e..6a9230b48848 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -233,10 +233,10 @@ static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) } /* - * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + * erts_atom_put_index() may fail. Returns negative indexes for errors. */ -Eterm -erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) +int +erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; const byte *text = name; @@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = 0; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } switch (enc) { @@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = MAX_ATOM_CHARACTERS; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } #ifdef DEBUG for (aix = 0; aix < len; aix++) { @@ -276,7 +276,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = MAX_ATOM_CHARACTERS; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } no_latin1_chars = tlen; latin1_to_utf8(utf8_copy, &text, &tlen); @@ -284,7 +284,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) case ERTS_ATOM_ENC_UTF8: /* First sanity check; need to verify later */ if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; break; } @@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) atom_read_unlock(); if (aix >= 0) { /* Already in table no need to verify it */ - return make_atom(aix); + return aix; } if (enc == ERTS_ATOM_ENC_UTF8) { @@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) case ERTS_UTF8_OK_MAX_CHARS: /* Truncated... */ if (!trunc) - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; ASSERT(no_chars == MAX_ATOM_CHARACTERS); tlen = err_pos - text; break; default: /* Bad utf8... */ - return THE_NON_VALUE; + return ATOM_BAD_ENCODING_ERROR; } } @@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) atom_write_lock(); aix = index_put(&erts_atom_table, (void*) &a); atom_write_unlock(); - return make_atom(aix); + return aix; +} + +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ +Eterm +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) +{ + int aix = erts_atom_put_index(name, len, enc, trunc); + if (aix >= 0) + return make_atom(aix); + else + return THE_NON_VALUE; } Eterm diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index ae60904785cf..e4b2ebe0bb6e 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -29,6 +29,8 @@ #define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */ #define ATOM_LIMIT (1024*1024) #define MIN_ATOM_TABLE_SIZE 8192 +#define ATOM_BAD_ENCODING_ERROR -1 +#define ATOM_MAX_CHARS_ERROR -2 #ifndef ARCH_32 /* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an @@ -133,6 +135,7 @@ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */ Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc); +int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index d69b18e22f6e..195a36c3e918 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -155,13 +155,15 @@ typedef struct { #define STR_CHUNK 2 #define IMP_CHUNK 3 #define EXP_CHUNK 4 -#define NUM_MANDATORY 5 +#define MIN_MANDATORY 1 +#define MAX_MANDATORY 5 #define LAMBDA_CHUNK 5 #define LITERAL_CHUNK 6 #define ATTR_CHUNK 7 #define COMPILE_CHUNK 8 #define LINE_CHUNK 9 +#define UTF8_ATOM_CHUNK 10 #define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0])) @@ -171,9 +173,13 @@ typedef struct { static Uint chunk_types[] = { /* - * Mandatory chunk types -- these MUST be present. + * Atom chunk types -- Atom or AtU8 MUST be present. */ MakeIffId('A', 't', 'o', 'm'), /* 0 */ + + /* + * Mandatory chunk types -- these MUST be present. + */ MakeIffId('C', 'o', 'd', 'e'), /* 1 */ MakeIffId('S', 't', 'r', 'T'), /* 2 */ MakeIffId('I', 'm', 'p', 'T'), /* 3 */ @@ -187,6 +193,7 @@ static Uint chunk_types[] = { MakeIffId('A', 't', 't', 'r'), /* 7 */ MakeIffId('C', 'I', 'n', 'f'), /* 8 */ MakeIffId('L', 'i', 'n', 'e'), /* 9 */ + MakeIffId('A', 't', 'U', '8'), /* 10 */ }; /* @@ -485,9 +492,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks, BeamCodeHeader* code, Uint size); static int init_iff_file(LoaderState* stp, byte* code, Uint size); static int scan_iff_file(LoaderState* stp, Uint* chunk_types, - Uint num_types, Uint num_mandatory); + Uint num_types); static int verify_chunks(LoaderState* stp); -static int load_atom_table(LoaderState* stp); +static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc); static int load_import_table(LoaderState* stp); static int read_export_table(LoaderState* stp); static int is_bif(Eterm mod, Eterm func, unsigned arity); @@ -626,7 +633,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader, CHKALLOC(); CHKBLK(ERTS_ALC_T_CODE,stp->code); if (!init_iff_file(stp, code, unloaded_size) || - !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { goto load_error; } @@ -671,9 +678,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader, */ CHKBLK(ERTS_ALC_T_CODE,stp->code); - define_file(stp, "atom table", ATOM_CHUNK); - if (!load_atom_table(stp)) { - goto load_error; + if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) { + define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) { + goto load_error; + } + } else { + define_file(stp, "atom table", ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) { + goto load_error; + } } /* @@ -1198,7 +1212,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size) * Scan the IFF file. The header should have been verified by init_iff_file(). */ static int -scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory) +scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types) { Uint count; Uint id; @@ -1277,7 +1291,16 @@ verify_chunks(LoaderState* stp) MD5_CTX context; MD5Init(&context); - for (i = 0; i < NUM_MANDATORY; i++) { + + if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) { + MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size); + } else if (stp->chunks[ATOM_CHUNK].start != NULL) { + MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size); + } else { + LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n"); + } + + for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) { if (stp->chunks[i].start != NULL) { MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size); } else { @@ -1338,7 +1361,7 @@ verify_chunks(LoaderState* stp) } static int -load_atom_table(LoaderState* stp) +load_atom_table(LoaderState* stp, ErtsAtomEncoding enc) { unsigned int i; @@ -1357,7 +1380,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1); + stp->atom[i] = erts_atom_put(atom, n, enc, 1); } /* @@ -5922,7 +5945,7 @@ code_get_chunk_2(BIF_ALIST_2) goto error; } if (!init_iff_file(stp, start, binary_size(Bin)) || - !scan_iff_file(stp, &chunk, 1, 1) || + !scan_iff_file(stp, &chunk, 1) || stp->chunks[0].start == NULL) { res = am_undefined; goto done; @@ -5971,7 +5994,7 @@ code_module_md5_1(BIF_ALIST_1) } stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */ if (!init_iff_file(stp, bytes, binary_size(Bin)) || - !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { res = am_undefined; goto done; @@ -6324,7 +6347,7 @@ erts_make_stub_module(Process* p, Eterm Mod, Eterm Beam, Eterm Info) if (!init_iff_file(stp, bytes, size)) { goto error; } - if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { goto error; } @@ -6332,9 +6355,16 @@ erts_make_stub_module(Process* p, Eterm Mod, Eterm Beam, Eterm Info) if (!read_code_header(stp)) { goto error; } - define_file(stp, "atom table", ATOM_CHUNK); - if (!load_atom_table(stp)) { - goto error; + if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) { + define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) { + goto error; + } + } else { + define_file(stp, "atom table", ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) { + goto error; + } } define_file(stp, "export table", EXP_CHUNK); if (!stub_read_export_table(stp)) { diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index bd5e1482fb85..79842dec899e 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1895,69 +1895,52 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); + Eterm a; + if (enc == am_latin1) { - Eterm a; - if (bin_size > MAX_ATOM_CHARACTERS) { - system_limit: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(proc, SYSTEM_LIMIT); - } if (!must_exist) { - a = erts_atom_put((byte *) bytes, - bin_size, - ERTS_ATOM_ENC_LATIN1, - 0); - erts_free_aligned_binary_bytes(temp_alloc); - if (is_non_value(a)) - goto badarg; - BIF_RET(a); - } else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) { - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(a); - } else { + int lix = erts_atom_put_index((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + if (lix == ATOM_BAD_ENCODING_ERROR) { + badarg: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(proc, BADARG); + } else if (lix == ATOM_MAX_CHARS_ERROR) { + system_limit: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(proc, SYSTEM_LIMIT); + } + + a = make_atom(lix); + } else if (!erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) { goto badarg; } - } else if (enc == am_utf8 || enc == am_unicode) { - Eterm res; - Uint num_chars = 0; - const byte* p = bytes; - Uint left = bin_size; - while (left) { - if (++num_chars > MAX_ATOM_CHARACTERS) { + } else if (enc == am_utf8 || enc == am_unicode) { + if (!must_exist) { + int uix = erts_atom_put_index((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); + if (uix == ATOM_BAD_ENCODING_ERROR) { + goto badarg; + } else if (uix == ATOM_MAX_CHARS_ERROR) { goto system_limit; } - if ((p[0] & 0x80) == 0) { - ++p; - --left; - } - else if (left >= 2 - && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */ - && (p[1] & 0xC0) == 0x80) { - p += 2; - left -= 2; - } - else goto badarg; - } - if (!must_exist) { - res = erts_atom_put((byte *) bytes, - bin_size, - ERTS_ATOM_ENC_UTF8, - 0); + a = make_atom(uix); } - else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) { + else if (!erts_atom_get((char*)bytes, bin_size, &a, ERTS_ATOM_ENC_UTF8)) { goto badarg; } - erts_free_aligned_binary_bytes(temp_alloc); - if (is_non_value(res)) - goto badarg; - BIF_RET(res); } else { - badarg: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(proc, BADARG); + goto badarg; } + + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(a); } BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2) diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl index d31399e4af04..0d95243d229e 100644 --- a/erts/emulator/test/bif_SUITE.erl +++ b/erts/emulator/test/bif_SUITE.erl @@ -426,6 +426,8 @@ binary_to_atom(Config) when is_list(Config) -> Long = lists:seq(0, 254), LongAtom = list_to_atom(Long), LongBin = list_to_binary(Long), + UnicodeLongAtom = list_to_atom([$é || _ <- lists:seq(0, 254)]), + UnicodeLongBin = << <<"é"/utf8>> || _ <- lists:seq(0, 254)>>, %% latin1 '' = test_binary_to_atom(<<>>, latin1), @@ -437,12 +439,17 @@ binary_to_atom(Config) when is_list(Config) -> '' = test_binary_to_atom(<<>>, utf8), HalfLongAtom = test_binary_to_atom(HalfLongBin, utf8), HalfLongAtom = test_binary_to_atom(HalfLongBin, unicode), + UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, utf8), + UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, unicode), [] = [C || C <- lists:seq(128, 255), begin list_to_atom([C]) =/= test_binary_to_atom(<>, utf8) end], + <<"こんにちは"/utf8>> = + atom_to_binary(test_binary_to_atom(<<"こんにちは"/utf8>>, utf8), utf8), + %% badarg failures. fail_binary_to_atom(atom), fail_binary_to_atom(42), @@ -461,10 +468,6 @@ binary_to_atom(Config) when is_list(Config) -> ?BADARG(binary_to_atom(id(<<255>>), utf8)), ?BADARG(binary_to_atom(id(<<255,0>>), utf8)), ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0. - [?BADARG(binary_to_atom(<>, utf8)) || C <- lists:seq(256, 16#D7FF)], - [?BADARG(binary_to_atom(<>, utf8)) || C <- lists:seq(16#E000, 16#FFFD)], - [?BADARG(binary_to_atom(<>, utf8)) || C <- lists:seq(16#10000, 16#8FFFF)], - [?BADARG(binary_to_atom(<>, utf8)) || C <- lists:seq(16#90000, 16#10FFFF)], %% system_limit failures. ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), diff --git a/erts/emulator/test/code_SUITE.erl b/erts/emulator/test/code_SUITE.erl index 8427bb134dd7..3b58028b2544 100644 --- a/erts/emulator/test/code_SUITE.erl +++ b/erts/emulator/test/code_SUITE.erl @@ -400,16 +400,16 @@ get_chunk(Config) when is_list(Config) -> {ok,my_code_test,Code} = compile:file(File, [binary]), %% Should work. - Chunk = get_chunk_ok("Atom", Code), - Chunk = get_chunk_ok("Atom", make_sub_binary(Code)), - Chunk = get_chunk_ok("Atom", make_unaligned_sub_binary(Code)), + Chunk = get_chunk_ok("AtU8", Code), + Chunk = get_chunk_ok("AtU8", make_sub_binary(Code)), + Chunk = get_chunk_ok("AtU8", make_unaligned_sub_binary(Code)), %% Should fail. - {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "Atom")), + {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "AtU8")), {'EXIT',{badarg,_}} = (catch code:get_chunk(Code, "bad chunk id")), %% Invalid beam code or missing chunk should return 'undefined'. - undefined = code:get_chunk(<<"not a beam module">>, "Atom"), + undefined = code:get_chunk(<<"not a beam module">>, "AtU8"), undefined = code:get_chunk(Code, "XXXX"), ok. diff --git a/lib/compiler/src/beam_asm.erl b/lib/compiler/src/beam_asm.erl index f6ca7a0afba3..6c44b6a81746 100644 --- a/lib/compiler/src/beam_asm.erl +++ b/lib/compiler/src/beam_asm.erl @@ -21,23 +21,23 @@ -module(beam_asm). --export([module/4]). +-export([module/5]). -export([encode/2]). -import(lists, [map/2,member/2,keymember/3,duplicate/2,splitwith/2]). -include("beam_opcodes.hrl"). -module(Code, Abst, SourceFile, Opts) -> - {ok,assemble(Code, Abst, SourceFile, Opts)}. +module(Code, Abst, SourceFile, Opts, CompilerOpts) -> + {ok,assemble(Code, Abst, SourceFile, Opts, CompilerOpts)}. -assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts) -> +assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts, CompilerOpts) -> {1,Dict0} = beam_dict:atom(Mod, beam_dict:new()), {0,Dict1} = beam_dict:fname(atom_to_list(Mod) ++ ".erl", Dict0), NumFuncs = length(Asm0), {Asm,Attr} = on_load(Asm0, Attr0), Exp = cerl_sets:from_list(Exp0), {Code,Dict2} = assemble_1(Asm, Exp, Dict1, []), - build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts). + build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts). on_load(Fs0, Attr0) -> case proplists:get_value(on_load, Attr0) of @@ -80,7 +80,7 @@ assemble_function([H|T], Acc, Dict0) -> assemble_function([], Code, Dict) -> {Code, Dict}. -build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> +build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts) -> %% Create the code chunk. CodeChunk = chunk(<<"Code">>, @@ -92,9 +92,9 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> Code), %% Create the atom table chunk. - - {NumAtoms, AtomTab} = beam_dict:atom_table(Dict), - AtomChunk = chunk(<<"Atom">>, <>, AtomTab), + AtomEncoding = atom_encoding(CompilerOpts), + {NumAtoms, AtomTab} = beam_dict:atom_table(Dict, AtomEncoding), + AtomChunk = chunk(atom_chunk_name(AtomEncoding), <>, AtomTab), %% Create the import table chunk. @@ -170,6 +170,15 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> end, build_form(<<"BEAM">>, Chunks). +atom_encoding(Opts) -> + case proplists:get_bool(no_utf8_atoms, Opts) of + false -> utf8; + true -> latin1 + end. + +atom_chunk_name(utf8) -> <<"AtU8">>; +atom_chunk_name(latin1) -> <<"Atom">>. + %% finalize_fun_table(Essentials, MD5) -> FinalizedEssentials %% Update the 'old_uniq' field in the entry for each fun in the %% 'FunT' chunk. We'll use part of the MD5 for the module as a diff --git a/lib/compiler/src/beam_dict.erl b/lib/compiler/src/beam_dict.erl index 9565ab74c44c..93cb74f55ef1 100644 --- a/lib/compiler/src/beam_dict.erl +++ b/lib/compiler/src/beam_dict.erl @@ -24,7 +24,7 @@ -export([new/0,opcode/2,highest_opcode/1, atom/2,local/4,export/4,import/4, string/2,lambda/3,literal/2,line/2,fname/2, - atom_table/1,local_table/1,export_table/1,import_table/1, + atom_table/2,local_table/1,export_table/1,import_table/1, string_table/1,lambda_table/1,literal_table/1, line_table/1]). @@ -194,15 +194,15 @@ fname(Name, #asm{fnames=Fnames}=Dict) -> end. %% Returns the atom table. -%% atom_table(Dict) -> {LastIndex,[Length,AtomString...]} --spec atom_table(bdict()) -> {non_neg_integer(), [[non_neg_integer(),...]]}. +%% atom_table(Dict, Encoding) -> {LastIndex,[Length,AtomString...]} +-spec atom_table(bdict(), latin1 | utf8) -> {non_neg_integer(), [[non_neg_integer(),...]]}. -atom_table(#asm{atoms=Atoms}) -> +atom_table(#asm{atoms=Atoms}, Encoding) -> NumAtoms = maps:size(Atoms), Sorted = lists:keysort(2, maps:to_list(Atoms)), {NumAtoms,[begin - L = atom_to_list(A), - [length(L)|L] + L = atom_to_binary(A, Encoding), + [byte_size(L)|L] end || {A,_} <- Sorted]}. %% Returns the table of local functions. diff --git a/lib/compiler/src/beam_disasm.erl b/lib/compiler/src/beam_disasm.erl index c699672db151..458a215ff144 100644 --- a/lib/compiler/src/beam_disasm.erl +++ b/lib/compiler/src/beam_disasm.erl @@ -172,10 +172,10 @@ file(File) -> %%----------------------------------------------------------------------- process_chunks(F) -> - case beam_lib:chunks(F, [atoms,"Code","StrT", + case beam_lib:chunks(F, [utf8_atoms,"Code","StrT", indexed_imports,labeled_exports]) of {ok,{Module, - [{atoms,AtomsList},{"Code",CodeBin},{"StrT",StrBin}, + [{utf8_atoms,AtomsList},{"Code",CodeBin},{"StrT",StrBin}, {indexed_imports,ImportsList},{labeled_exports,Exports}]}} -> Atoms = mk_atoms(AtomsList), LambdaBin = optional_chunk(F, "FunT"), diff --git a/lib/compiler/src/compile.erl b/lib/compiler/src/compile.erl index 16621d9b43f4..ef54396fef7a 100644 --- a/lib/compiler/src/compile.erl +++ b/lib/compiler/src/compile.erl @@ -206,11 +206,13 @@ expand_opt(report, Os) -> expand_opt(return, Os) -> [return_errors,return_warnings|Os]; expand_opt(r12, Os) -> - [no_recv_opt,no_line_info|Os]; + [no_recv_opt,no_line_info,no_utf8_atoms|Os]; expand_opt(r13, Os) -> - [no_recv_opt,no_line_info|Os]; + [no_recv_opt,no_line_info,no_utf8_atoms|Os]; expand_opt(r14, Os) -> - [no_line_info|Os]; + [no_line_info,no_utf8_atoms|Os]; +expand_opt(r19, Os) -> + [no_utf8_atoms|Os]; expand_opt({debug_info_key,_}=O, Os) -> [encrypt_debug_info,O|Os]; expand_opt(no_float_opt, Os) -> @@ -1345,14 +1347,14 @@ encrypt({des3_cbc=Type,Key,IVec,BlockSize}, Bin0) -> save_core_code(St) -> {ok,St#compile{core_code=cerl:from_records(St#compile.code)}}. -beam_asm(#compile{ifile=File,code=Code0, +beam_asm(#compile{ifile=File,code=Code0,options=CompilerOpts, abstract_code=Abst,mod_options=Opts0}=St) -> Source = paranoid_absname(File), Opts1 = lists:map(fun({debug_info_key,_}) -> {debug_info_key,'********'}; (Other) -> Other end, Opts0), Opts2 = [O || O <- Opts1, effects_code_generation(O)], - case beam_asm:module(Code0, Abst, Source, Opts2) of + case beam_asm:module(Code0, Abst, Source, Opts2, CompilerOpts) of {ok,Code} -> {ok,St#compile{code=Code,abstract_code=[]}} end. diff --git a/lib/compiler/test/lc_SUITE.erl b/lib/compiler/test/lc_SUITE.erl index 3cb49433ce2b..6904ee7f5283 100644 --- a/lib/compiler/test/lc_SUITE.erl +++ b/lib/compiler/test/lc_SUITE.erl @@ -226,7 +226,7 @@ effect(Config) when is_list(Config) -> lc_SUITE -> _ = [{'EXIT',{badarg,_}} = (catch binary_to_atom(<>, utf8)) || - C <- lists:seq(16#10000, 16#FFFFF)]; + C <- lists:seq(16#FF10000, 16#FFFFFFF)]; _ -> ok end, diff --git a/lib/stdlib/src/beam_lib.erl b/lib/stdlib/src/beam_lib.erl index d7ee5c1f5d4a..f312e5d6bb19 100644 --- a/lib/stdlib/src/beam_lib.erl +++ b/lib/stdlib/src/beam_lib.erl @@ -63,12 +63,12 @@ -type label() :: integer(). -type chunkid() :: nonempty_string(). % approximation of the strings below -%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom". +%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom" | "AtU8". -type chunkname() :: 'abstract_code' | 'attributes' | 'compile_info' | 'exports' | 'labeled_exports' | 'imports' | 'indexed_imports' | 'locals' | 'labeled_locals' - | 'atoms'. + | 'atoms' | 'utf8_atoms'. -type chunkref() :: chunkname() | chunkid(). -type attrib_entry() :: {Attribute :: atom(), [AttributeValue :: term()]}. @@ -85,7 +85,8 @@ | {'indexed_imports', [{index(), module(), Function :: atom(), arity()}]} | {'locals', [{atom(), arity()}]} | {'labeled_locals', [labeled_entry()]} - | {'atoms', [{integer(), atom()}]}. + | {'atoms', [{integer(), atom()}]} + | {'utf8_atoms', [{integer(), atom()}]}. %% Error reasons -type info_rsn() :: {'chunk_too_big', file:filename(), @@ -581,18 +582,23 @@ scan_beam(FD, Pos, What, Mod, Data) -> error({invalid_beam_file, filename(FD), Pos}) end. -get_data(Cs, "Atom"=Id, FD, Size, Pos, Pos2, _Mod, Data) -> +get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, Encoding) -> NewCs = del_chunk(Id, Cs), {NFD, Chunk} = get_chunk(Id, Pos, Size, FD), <<_Num:32, Chunk2/binary>> = Chunk, - {Module, _} = extract_atom(Chunk2), + {Module, _} = extract_atom(Chunk2, Encoding), C = case Cs of info -> {Id, Pos, Size}; _ -> {Id, Chunk} end, - scan_beam(NFD, Pos2, NewCs, Module, [C | Data]); + scan_beam(NFD, Pos2, NewCs, Module, [C | Data]). + +get_data(Cs, "Atom" = Id, FD, Size, Pos, Pos2, _Mod, Data) -> + get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, latin1); +get_data(Cs, "AtU8" = Id, FD, Size, Pos, Pos2, _Mod, Data) -> + get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, utf8); get_data(info, Id, FD, Size, Pos, Pos2, Mod, Data) -> scan_beam(FD, Pos2, info, Mod, [{Id, Pos, Size} | Data]); get_data(Chunks, Id, FD, Size, Pos, Pos2, Mod, Data) -> @@ -671,6 +677,10 @@ chunk_to_data(atoms=Id, _Chunk, _File, Cs, AtomTable0, _Mod) -> AtomTable = ensure_atoms(AtomTable0, Cs), Atoms = ets:tab2list(AtomTable), {AtomTable, {Id, lists:sort(Atoms)}}; +chunk_to_data(utf8_atoms=Id, _Chunk, _File, Cs, AtomTable0, _Mod) -> + AtomTable = ensure_atoms(AtomTable0, Cs), + Atoms = ets:tab2list(AtomTable), + {AtomTable, {Id, lists:sort(Atoms)}}; chunk_to_data(ChunkName, Chunk, File, Cs, AtomTable, _Mod) when is_atom(ChunkName) -> case catch symbols(Chunk, AtomTable, Cs, ChunkName) of @@ -684,6 +694,7 @@ chunk_to_data(ChunkId, Chunk, _File, {AtomTable, {ChunkId, Chunk}}. % Chunk is a binary chunk_name_to_id(atoms, _) -> "Atom"; +chunk_name_to_id(utf8_atoms, _) -> "AtU8"; chunk_name_to_id(indexed_imports, _) -> "ImpT"; chunk_name_to_id(imports, _) -> "ImpT"; chunk_name_to_id(exports, _) -> "ExpT"; @@ -738,25 +749,30 @@ atm(AT, N) -> %% AT is updated. ensure_atoms({empty, AT}, Cs) -> - {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs), - extract_atoms(AtomChunk, AT), + case lists:keyfind("AtU8", 1, Cs) of + {_Id, AtomChunk} -> + extract_atoms(AtomChunk, AT, utf8); + false -> + {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs), + extract_atoms(AtomChunk, AT, latin1) + end, AT; ensure_atoms(AT, _Cs) -> AT. -extract_atoms(<<_Num:32, B/binary>>, AT) -> - extract_atoms(B, 1, AT). +extract_atoms(<<_Num:32, B/binary>>, AT, Encoding) -> + extract_atoms(B, 1, AT, Encoding). -extract_atoms(<<>>, _I, _AT) -> +extract_atoms(<<>>, _I, _AT, _Encoding) -> true; -extract_atoms(B, I, AT) -> - {Atom, B1} = extract_atom(B), +extract_atoms(B, I, AT, Encoding) -> + {Atom, B1} = extract_atom(B, Encoding), true = ets:insert(AT, {I, Atom}), - extract_atoms(B1, I+1, AT). + extract_atoms(B1, I+1, AT, Encoding). -extract_atom(<>) -> +extract_atom(<>, Encoding) -> <> = B, - {list_to_atom(binary_to_list(SB)), Tail}. + {binary_to_atom(SB, Encoding), Tail}. %%% Utils. @@ -856,12 +872,12 @@ significant_chunks() -> %% for a module. They are listed in the order that they should be MD5:ed. md5_chunks() -> - ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"]. + ["Atom", "AtU8", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"]. %% The following chunks are mandatory in every Beam file. mandatory_chunks() -> - ["Code", "ExpT", "ImpT", "StrT", "Atom"]. + ["Code", "ExpT", "ImpT", "StrT"]. %%% ==================================================================== %%% The rest of the file handles encrypted debug info. diff --git a/lib/stdlib/test/beam_lib_SUITE.erl b/lib/stdlib/test/beam_lib_SUITE.erl index 4521ecc0ef32..d8689ec40654 100644 --- a/lib/stdlib/test/beam_lib_SUITE.erl +++ b/lib/stdlib/test/beam_lib_SUITE.erl @@ -119,8 +119,10 @@ do_normal(BeamFile) -> {ok, {simple, [{attributes, [{vsn, [_]}]}, CompileInfo, Exports, Imports, Local]}} = beam_lib:chunks(BeamFile, [attributes, compile_info, exports, imports, locals]), - {ok, {simple, [{atoms, _Atoms}]}} = + {error, beam_lib, {missing_chunk, _, "Atom"}} = beam_lib:chunks(BeamFile, [atoms]), + {ok, {simple, [{utf8_atoms, _Atoms}]}} = + beam_lib:chunks(BeamFile, [utf8_atoms]), {ok, {simple, [{labeled_exports, _LExports}]}} = beam_lib:chunks(BeamFile, [labeled_exports]), {ok, {simple, [{labeled_locals, _LLocals}]}} = @@ -130,17 +132,18 @@ do_normal(BeamFile) -> beam_lib:chunks(BeamFile, [abstract_code]), %% Test reading optional chunks. - All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"], + All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT", "AtU8"], {ok,{simple,Chunks}} = beam_lib:chunks(BeamFile, All, [allow_missing_chunks]), verify_simple(Chunks). -verify_simple([{"Atom", AtomBin}, +verify_simple([{"Atom", missing_chunk}, {"Code", CodeBin}, {"StrT", StrBin}, {"ImpT", ImpBin}, {"ExpT", ExpBin}, {"FunT", missing_chunk}, - {"LitT", missing_chunk}]) + {"LitT", missing_chunk}, + {"AtU8", AtomBin}]) when is_binary(AtomBin), is_binary(CodeBin), is_binary(StrBin), is_binary(ImpBin), is_binary(ExpBin) -> ok. @@ -211,7 +214,7 @@ last_chunk(Bin) -> do_error(BeamFile, ACopy) -> %% evil tests Chunks = chunk_info(BeamFile), - {value, {_, AtomStart, _}} = lists:keysearch("Atom", 1, Chunks), + {value, {_, AtomStart, _}} = lists:keysearch("AtU8", 1, Chunks), {value, {_, ImportStart, _}} = lists:keysearch("ImpT", 1, Chunks), {value, {_, AbstractStart, _}} = lists:keysearch("Abst", 1, Chunks), {value, {_, AttributesStart, _}} = @@ -234,7 +237,7 @@ do_error(BeamFile, ACopy) -> verify(not_a_beam_file, beam_lib:info(BF7)), BF8 = set_byte(ACopy, BeamFile, 13, 17), - verify(missing_chunk, beam_lib:chunks(BF8, ["Atom"])), + verify(missing_chunk, beam_lib:chunks(BF8, ["AtU8"])), BF9 = set_byte(ACopy, BeamFile, CompileInfoStart+10, 17), verify(invalid_chunk, beam_lib:chunks(BF9, [compile_info])).