Skip to content

Commit

Permalink
Add new AtU8 beam chunk
Browse files Browse the repository at this point in the history
The new chunk stores atoms encoded in UTF-8.

beam_lib has also been modified to handle the new
'utf8_atoms' attribute, while the chunk for the 'atoms'
attribute may be missing from now on.

The binary_to_atom/2 BIF can now encode any utf8
binary with up to 255 characters.
  • Loading branch information
José Valim committed Sep 26, 2016
1 parent 7228e3e commit e426d18
Show file tree
Hide file tree
Showing 13 changed files with 197 additions and 135 deletions.
35 changes: 24 additions & 11 deletions erts/emulator/beam/atom.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,10 @@ static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
}

/*
* erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
* erts_atom_put_index() may fail. Returns negative indexes for errors.
*/
Eterm
erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
int
erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
const byte *text = name;
Expand All @@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = 0;
else
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
}

switch (enc) {
Expand All @@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = MAX_ATOM_CHARACTERS;
else
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
}
#ifdef DEBUG
for (aix = 0; aix < len; aix++) {
Expand All @@ -276,15 +276,15 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = MAX_ATOM_CHARACTERS;
else
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
}
no_latin1_chars = tlen;
latin1_to_utf8(utf8_copy, &text, &tlen);
break;
case ERTS_ATOM_ENC_UTF8:
/* First sanity check; need to verify later */
if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
break;
}

Expand All @@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
atom_read_unlock();
if (aix >= 0) {
/* Already in table no need to verify it */
return make_atom(aix);
return aix;
}

if (enc == ERTS_ATOM_ENC_UTF8) {
Expand All @@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
case ERTS_UTF8_OK_MAX_CHARS:
/* Truncated... */
if (!trunc)
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
ASSERT(no_chars == MAX_ATOM_CHARACTERS);
tlen = err_pos - text;
break;
default:
/* Bad utf8... */
return THE_NON_VALUE;
return ATOM_BAD_ENCODING_ERROR;
}
}

Expand All @@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
atom_write_lock();
aix = index_put(&erts_atom_table, (void*) &a);
atom_write_unlock();
return make_atom(aix);
return aix;
}

/*
 * erts_atom_put() looks up or inserts an atom via erts_atom_put_index()
 * and returns it as a term.
 *
 * A non-negative index is wrapped with make_atom(). Any negative index
 * (an error code from erts_atom_put_index()) is reported to the caller
 * as THE_NON_VALUE, so callers must check for it.
 */
Eterm
erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
    const int index = erts_atom_put_index(name, len, enc, trunc);

    /* Negative values are error codes, never valid atom indexes. */
    if (index < 0) {
        return THE_NON_VALUE;
    }
    return make_atom(index);
}

Eterm
Expand Down
3 changes: 3 additions & 0 deletions erts/emulator/beam/atom.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
#define ATOM_LIMIT (1024*1024)
#define MIN_ATOM_TABLE_SIZE 8192
/*
 * Negative error codes returned by erts_atom_put_index() in place of an
 * atom index. The values are parenthesized so they expand safely inside
 * any surrounding expression.
 */
#define ATOM_BAD_ENCODING_ERROR (-1)
#define ATOM_MAX_CHARS_ERROR (-2)

#ifndef ARCH_32
/* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an
Expand Down Expand Up @@ -133,6 +135,7 @@ int atom_table_sz(void); /* table size in bytes, excluding stored objects */

Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
int atom_erase(byte*, int);
int atom_static_put(byte*, int);
void init_atom_table(void);
Expand Down
66 changes: 48 additions & 18 deletions erts/emulator/beam/beam_load.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,15 @@ typedef struct {
#define STR_CHUNK 2
#define IMP_CHUNK 3
#define EXP_CHUNK 4
#define NUM_MANDATORY 5
#define MIN_MANDATORY 1
#define MAX_MANDATORY 5

#define LAMBDA_CHUNK 5
#define LITERAL_CHUNK 6
#define ATTR_CHUNK 7
#define COMPILE_CHUNK 8
#define LINE_CHUNK 9
#define UTF8_ATOM_CHUNK 10

#define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0]))

Expand All @@ -171,9 +173,13 @@ typedef struct {

static Uint chunk_types[] = {
/*
* Mandatory chunk types -- these MUST be present.
* Atom chunk types -- Atom or AtU8 MUST be present.
*/
MakeIffId('A', 't', 'o', 'm'), /* 0 */

/*
* Mandatory chunk types -- these MUST be present.
*/
MakeIffId('C', 'o', 'd', 'e'), /* 1 */
MakeIffId('S', 't', 'r', 'T'), /* 2 */
MakeIffId('I', 'm', 'p', 'T'), /* 3 */
Expand All @@ -187,6 +193,7 @@ static Uint chunk_types[] = {
MakeIffId('A', 't', 't', 'r'), /* 7 */
MakeIffId('C', 'I', 'n', 'f'), /* 8 */
MakeIffId('L', 'i', 'n', 'e'), /* 9 */
MakeIffId('A', 't', 'U', '8'), /* 10 */
};

/*
Expand Down Expand Up @@ -485,9 +492,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks,
BeamCodeHeader* code, Uint size);
static int init_iff_file(LoaderState* stp, byte* code, Uint size);
static int scan_iff_file(LoaderState* stp, Uint* chunk_types,
Uint num_types, Uint num_mandatory);
Uint num_types);
static int verify_chunks(LoaderState* stp);
static int load_atom_table(LoaderState* stp);
static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc);
static int load_import_table(LoaderState* stp);
static int read_export_table(LoaderState* stp);
static int is_bif(Eterm mod, Eterm func, unsigned arity);
Expand Down Expand Up @@ -626,7 +633,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
CHKALLOC();
CHKBLK(ERTS_ALC_T_CODE,stp->code);
if (!init_iff_file(stp, code, unloaded_size) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto load_error;
}
Expand Down Expand Up @@ -671,9 +678,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
*/

CHKBLK(ERTS_ALC_T_CODE,stp->code);
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp)) {
goto load_error;
if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
goto load_error;
}
} else {
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
goto load_error;
}
}

/*
Expand Down Expand Up @@ -1198,7 +1212,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size)
* Scan the IFF file. The header should have been verified by init_iff_file().
*/
static int
scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory)
scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types)
{
Uint count;
Uint id;
Expand Down Expand Up @@ -1277,7 +1291,16 @@ verify_chunks(LoaderState* stp)
MD5_CTX context;

MD5Init(&context);
for (i = 0; i < NUM_MANDATORY; i++) {

if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) {
MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size);
} else if (stp->chunks[ATOM_CHUNK].start != NULL) {
MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size);
} else {
LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n");
}

for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) {
if (stp->chunks[i].start != NULL) {
MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size);
} else {
Expand Down Expand Up @@ -1338,7 +1361,7 @@ verify_chunks(LoaderState* stp)
}

static int
load_atom_table(LoaderState* stp)
load_atom_table(LoaderState* stp, ErtsAtomEncoding enc)
{
unsigned int i;

Expand All @@ -1357,7 +1380,7 @@ load_atom_table(LoaderState* stp)

GetByte(stp, n);
GetString(stp, atom, n);
stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
stp->atom[i] = erts_atom_put(atom, n, enc, 1);
}

/*
Expand Down Expand Up @@ -5922,7 +5945,7 @@ code_get_chunk_2(BIF_ALIST_2)
goto error;
}
if (!init_iff_file(stp, start, binary_size(Bin)) ||
!scan_iff_file(stp, &chunk, 1, 1) ||
!scan_iff_file(stp, &chunk, 1) ||
stp->chunks[0].start == NULL) {
res = am_undefined;
goto done;
Expand Down Expand Up @@ -5971,7 +5994,7 @@ code_module_md5_1(BIF_ALIST_1)
}
stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */
if (!init_iff_file(stp, bytes, binary_size(Bin)) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
res = am_undefined;
goto done;
Expand Down Expand Up @@ -6324,17 +6347,24 @@ erts_make_stub_module(Process* p, Eterm Mod, Eterm Beam, Eterm Info)
if (!init_iff_file(stp, bytes, size)) {
goto error;
}
if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto error;
}
define_file(stp, "code chunk header", CODE_CHUNK);
if (!read_code_header(stp)) {
goto error;
}
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp)) {
goto error;
if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
goto error;
}
} else {
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
goto error;
}
}
define_file(stp, "export table", EXP_CHUNK);
if (!stub_read_export_table(stp)) {
Expand Down
83 changes: 33 additions & 50 deletions erts/emulator/beam/erl_unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1895,69 +1895,52 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
BIF_ERROR(proc, BADARG);
}
bin_size = binary_size(bin);
Eterm a;

if (enc == am_latin1) {
Eterm a;
if (bin_size > MAX_ATOM_CHARACTERS) {
system_limit:
erts_free_aligned_binary_bytes(temp_alloc);
BIF_ERROR(proc, SYSTEM_LIMIT);
}
if (!must_exist) {
a = erts_atom_put((byte *) bytes,
bin_size,
ERTS_ATOM_ENC_LATIN1,
0);
erts_free_aligned_binary_bytes(temp_alloc);
if (is_non_value(a))
goto badarg;
BIF_RET(a);
} else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
erts_free_aligned_binary_bytes(temp_alloc);
BIF_RET(a);
} else {
int lix = erts_atom_put_index((byte *) bytes,
bin_size,
ERTS_ATOM_ENC_LATIN1,
0);
if (lix == ATOM_BAD_ENCODING_ERROR) {
badarg:
erts_free_aligned_binary_bytes(temp_alloc);
BIF_ERROR(proc, BADARG);
} else if (lix == ATOM_MAX_CHARS_ERROR) {
system_limit:
erts_free_aligned_binary_bytes(temp_alloc);
BIF_ERROR(proc, SYSTEM_LIMIT);
}

a = make_atom(lix);
} else if (!erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
goto badarg;
}
} else if (enc == am_utf8 || enc == am_unicode) {
Eterm res;
Uint num_chars = 0;
const byte* p = bytes;
Uint left = bin_size;

while (left) {
if (++num_chars > MAX_ATOM_CHARACTERS) {
} else if (enc == am_utf8 || enc == am_unicode) {
if (!must_exist) {
int uix = erts_atom_put_index((byte *) bytes,
bin_size,
ERTS_ATOM_ENC_UTF8,
0);
if (uix == ATOM_BAD_ENCODING_ERROR) {
goto badarg;
} else if (uix == ATOM_MAX_CHARS_ERROR) {
goto system_limit;
}
if ((p[0] & 0x80) == 0) {
++p;
--left;
}
else if (left >= 2
&& (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
&& (p[1] & 0xC0) == 0x80) {
p += 2;
left -= 2;
}
else goto badarg;
}

if (!must_exist) {
res = erts_atom_put((byte *) bytes,
bin_size,
ERTS_ATOM_ENC_UTF8,
0);
a = make_atom(uix);
}
else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) {
else if (!erts_atom_get((char*)bytes, bin_size, &a, ERTS_ATOM_ENC_UTF8)) {
goto badarg;
}
erts_free_aligned_binary_bytes(temp_alloc);
if (is_non_value(res))
goto badarg;
BIF_RET(res);
} else {
badarg:
erts_free_aligned_binary_bytes(temp_alloc);
BIF_ERROR(proc, BADARG);
goto badarg;
}

erts_free_aligned_binary_bytes(temp_alloc);
BIF_RET(a);
}

BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2)
Expand Down
Loading

0 comments on commit e426d18

Please sign in to comment.