Skip to content

Commit

Permalink
Add new AtU8 beam chunk
Browse files Browse the repository at this point in the history
The new chunk stores atoms encoded in UTF-8.

beam_lib has also been modified to handle the new
'utf8_atoms' attribute while the 'atoms' attribute
may be a missing chunk from now on.

The binary_to_atom/2 BIF can now encode any utf8 binary
although it is currently limited to 255 bytes.
  • Loading branch information
José Valim committed Jun 1, 2016
1 parent 962b25f commit 8b5121e
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 87 deletions.
66 changes: 48 additions & 18 deletions erts/emulator/beam/beam_load.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,15 @@ typedef struct {
#define STR_CHUNK 2
#define IMP_CHUNK 3
#define EXP_CHUNK 4
#define NUM_MANDATORY 5
#define MIN_MANDATORY 1
#define MAX_MANDATORY 5

#define LAMBDA_CHUNK 5
#define LITERAL_CHUNK 6
#define ATTR_CHUNK 7
#define COMPILE_CHUNK 8
#define LINE_CHUNK 9
#define UTF8_ATOM_CHUNK 10

#define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0]))

Expand All @@ -171,9 +173,13 @@ typedef struct {

static Uint chunk_types[] = {
/*
* Mandatory chunk types -- these MUST be present.
* Atom chunk types -- Atom or AtU8 MUST be present.
*/
MakeIffId('A', 't', 'o', 'm'), /* 0 */

/*
* Mandatory chunk types -- these MUST be present.
*/
MakeIffId('C', 'o', 'd', 'e'), /* 1 */
MakeIffId('S', 't', 'r', 'T'), /* 2 */
MakeIffId('I', 'm', 'p', 'T'), /* 3 */
Expand All @@ -187,6 +193,7 @@ static Uint chunk_types[] = {
MakeIffId('A', 't', 't', 'r'), /* 7 */
MakeIffId('C', 'I', 'n', 'f'), /* 8 */
MakeIffId('L', 'i', 'n', 'e'), /* 9 */
MakeIffId('A', 't', 'U', '8'), /* 10 */
};

/*
Expand Down Expand Up @@ -485,9 +492,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks,
BeamCodeHeader* code, Uint size);
static int init_iff_file(LoaderState* stp, byte* code, Uint size);
static int scan_iff_file(LoaderState* stp, Uint* chunk_types,
Uint num_types, Uint num_mandatory);
Uint num_types);
static int verify_chunks(LoaderState* stp);
static int load_atom_table(LoaderState* stp);
static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc);
static int load_import_table(LoaderState* stp);
static int read_export_table(LoaderState* stp);
static int is_bif(Eterm mod, Eterm func, unsigned arity);
Expand Down Expand Up @@ -626,7 +633,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
CHKALLOC();
CHKBLK(ERTS_ALC_T_CODE,stp->code);
if (!init_iff_file(stp, code, unloaded_size) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto load_error;
}
Expand Down Expand Up @@ -671,9 +678,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
*/

CHKBLK(ERTS_ALC_T_CODE,stp->code);
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp)) {
goto load_error;
if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
goto load_error;
}
} else {
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
goto load_error;
}
}

/*
Expand Down Expand Up @@ -1188,7 +1202,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size)
* Scan the IFF file. The header should have been verified by init_iff_file().
*/
static int
scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory)
scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types)
{
Uint count;
Uint id;
Expand Down Expand Up @@ -1267,7 +1281,16 @@ verify_chunks(LoaderState* stp)
MD5_CTX context;

MD5Init(&context);
for (i = 0; i < NUM_MANDATORY; i++) {

if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) {
MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size);
} else if (stp->chunks[ATOM_CHUNK].start != NULL) {
MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size);
} else {
LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n");
}

for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) {
if (stp->chunks[i].start != NULL) {
MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size);
} else {
Expand Down Expand Up @@ -1328,7 +1351,7 @@ verify_chunks(LoaderState* stp)
}

static int
load_atom_table(LoaderState* stp)
load_atom_table(LoaderState* stp, ErtsAtomEncoding enc)
{
int i;

Expand All @@ -1347,7 +1370,7 @@ load_atom_table(LoaderState* stp)

GetByte(stp, n);
GetString(stp, atom, n);
stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
stp->atom[i] = erts_atom_put(atom, n, enc, 1);
}

/*
Expand Down Expand Up @@ -5921,7 +5944,7 @@ code_get_chunk_2(BIF_ALIST_2)
goto error;
}
if (!init_iff_file(stp, start, binary_size(Bin)) ||
!scan_iff_file(stp, &chunk, 1, 1) ||
!scan_iff_file(stp, &chunk, 1) ||
stp->chunks[0].start == NULL) {
res = am_undefined;
goto done;
Expand Down Expand Up @@ -5970,7 +5993,7 @@ code_module_md5_1(BIF_ALIST_1)
}
stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */
if (!init_iff_file(stp, bytes, binary_size(Bin)) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
res = am_undefined;
goto done;
Expand Down Expand Up @@ -6323,17 +6346,24 @@ erts_make_stub_module(Process* p, Eterm Mod, Eterm Beam, Eterm Info)
if (!init_iff_file(stp, bytes, size)) {
goto error;
}
if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto error;
}
define_file(stp, "code chunk header", CODE_CHUNK);
if (!read_code_header(stp)) {
goto error;
}
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp)) {
goto error;
if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
goto error;
}
} else {
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
goto error;
}
}
define_file(stp, "export table", EXP_CHUNK);
if (!stub_read_export_table(stp)) {
Expand Down
33 changes: 7 additions & 26 deletions erts/emulator/beam/erl_unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1895,13 +1895,15 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
BIF_ERROR(proc, BADARG);
}
bin_size = binary_size(bin);

/* Limit of 255 latin characters or 255 unicode bytes */
if (bin_size > MAX_ATOM_CHARACTERS) {
erts_free_aligned_binary_bytes(temp_alloc);
BIF_ERROR(proc, SYSTEM_LIMIT);
}

if (enc == am_latin1) {
Eterm a;
if (bin_size > MAX_ATOM_CHARACTERS) {
system_limit:
erts_free_aligned_binary_bytes(temp_alloc);
BIF_ERROR(proc, SYSTEM_LIMIT);
}
if (!must_exist) {
a = erts_atom_put((byte *) bytes,
bin_size,
Expand All @@ -1919,27 +1921,6 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
}
} else if (enc == am_utf8 || enc == am_unicode) {
Eterm res;
Uint num_chars = 0;
const byte* p = bytes;
Uint left = bin_size;

while (left) {
if (++num_chars > MAX_ATOM_CHARACTERS) {
goto system_limit;
}
if ((p[0] & 0x80) == 0) {
++p;
--left;
}
else if (left >= 2
&& (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
&& (p[1] & 0xC0) == 0x80) {
p += 2;
left -= 2;
}
else goto badarg;
}

if (!must_exist) {
res = erts_atom_put((byte *) bytes,
bin_size,
Expand Down
7 changes: 3 additions & 4 deletions erts/emulator/test/bif_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,9 @@ binary_to_atom(Config) when is_list(Config) ->
test_binary_to_atom(<<C/utf8>>, utf8)
end],

<<"こんにちは"/utf8>> =
atom_to_binary(test_binary_to_atom(<<"こんにちは"/utf8>>, utf8), utf8),

%% badarg failures.
fail_binary_to_atom(atom),
fail_binary_to_atom(42),
Expand All @@ -456,10 +459,6 @@ binary_to_atom(Config) when is_list(Config) ->
?BADARG(binary_to_atom(id(<<255>>), utf8)),
?BADARG(binary_to_atom(id(<<255,0>>), utf8)),
?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0.
[?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(256, 16#D7FF)],
[?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#E000, 16#FFFD)],
[?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#10000, 16#8FFFF)],
[?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#90000, 16#10FFFF)],

%% system_limit failures.
?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
Expand Down
10 changes: 5 additions & 5 deletions erts/emulator/test/code_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -336,16 +336,16 @@ get_chunk(Config) when is_list(Config) ->
{ok,my_code_test,Code} = compile:file(File, [binary]),

%% Should work.
Chunk = get_chunk_ok("Atom", Code),
Chunk = get_chunk_ok("Atom", make_sub_binary(Code)),
Chunk = get_chunk_ok("Atom", make_unaligned_sub_binary(Code)),
Chunk = get_chunk_ok("AtU8", Code),
Chunk = get_chunk_ok("AtU8", make_sub_binary(Code)),
Chunk = get_chunk_ok("AtU8", make_unaligned_sub_binary(Code)),

%% Should fail.
{'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "Atom")),
{'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "AtU8")),
{'EXIT',{badarg,_}} = (catch code:get_chunk(Code, "bad chunk id")),

%% Invalid beam code or missing chunk should return 'undefined'.
undefined = code:get_chunk(<<"not a beam module">>, "Atom"),
undefined = code:get_chunk(<<"not a beam module">>, "AtU8"),
undefined = code:get_chunk(Code, "XXXX"),

ok.
Expand Down
7 changes: 5 additions & 2 deletions lib/compiler/src/beam_asm.erl
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) ->

%% Create the atom table chunk.

{NumAtoms, AtomTab} = beam_dict:atom_table(Dict),
AtomChunk = chunk(<<"Atom">>, <<NumAtoms:32>>, AtomTab),
{NumAtoms, AtomTab} = beam_dict:atom_table(Dict, utf8),
AtomChunk = chunk(atom_chunk_name(utf8), <<NumAtoms:32>>, AtomTab),

%% Create the import table chunk.

Expand Down Expand Up @@ -170,6 +170,9 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) ->
end,
build_form(<<"BEAM">>, Chunks).

atom_chunk_name(utf8) -> <<"AtU8">>;
atom_chunk_name(latin1) -> <<"Atom">>.

%% finalize_fun_table(Essentials, MD5) -> FinalizedEssentials
%% Update the 'old_uniq' field in the entry for each fun in the
%% 'FunT' chunk. We'll use part of the MD5 for the module as a
Expand Down
12 changes: 6 additions & 6 deletions lib/compiler/src/beam_dict.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
-export([new/0,opcode/2,highest_opcode/1,
atom/2,local/4,export/4,import/4,
string/2,lambda/3,literal/2,line/2,fname/2,
atom_table/1,local_table/1,export_table/1,import_table/1,
atom_table/2,local_table/1,export_table/1,import_table/1,
string_table/1,lambda_table/1,literal_table/1,
line_table/1]).

Expand Down Expand Up @@ -194,15 +194,15 @@ fname(Name, #asm{fnames=Fnames}=Dict) ->
end.

%% Returns the atom table.
%% atom_table(Dict) -> {LastIndex,[Length,AtomString...]}
-spec atom_table(bdict()) -> {non_neg_integer(), [[non_neg_integer(),...]]}.
%% atom_table(Dict, Encoding) -> {LastIndex,[Length,AtomString...]}
-spec atom_table(bdict(), latin1 | utf8) -> {non_neg_integer(), [[non_neg_integer(),...]]}.

atom_table(#asm{atoms=Atoms}) ->
atom_table(#asm{atoms=Atoms}, Encoding) ->
NumAtoms = maps:size(Atoms),
Sorted = lists:keysort(2, maps:to_list(Atoms)),
{NumAtoms,[begin
L = atom_to_list(A),
[length(L)|L]
L = atom_to_binary(A, Encoding),
[byte_size(L)|L]
end || {A,_} <- Sorted]}.

%% Returns the table of local functions.
Expand Down
4 changes: 2 additions & 2 deletions lib/compiler/src/beam_disasm.erl
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,10 @@ file(File) ->
%%-----------------------------------------------------------------------

process_chunks(F) ->
case beam_lib:chunks(F, [atoms,"Code","StrT",
case beam_lib:chunks(F, [utf8_atoms,"Code","StrT",
indexed_imports,labeled_exports]) of
{ok,{Module,
[{atoms,AtomsList},{"Code",CodeBin},{"StrT",StrBin},
[{utf8_atoms,AtomsList},{"Code",CodeBin},{"StrT",StrBin},
{indexed_imports,ImportsList},{labeled_exports,Exports}]}} ->
Atoms = mk_atoms(AtomsList),
LambdaBin = optional_chunk(F, "FunT"),
Expand Down
Loading

0 comments on commit 8b5121e

Please sign in to comment.