Skip to content

Commit

Permalink
convert_UTF8_to_JSON: repurpose the escape tables into size tables
Browse files Browse the repository at this point in the history
Since we're looking up the table anyway, we might as well store the
UTF-8 char length in it. For single byte characters that don't need
escaping we store `0`.

This helps on strings with lots of multi-byte characters:

Before:

```
== Encoding mostly utf8 (20004001 bytes)
ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json     6.000 i/100ms
                  oj    10.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     67.978 (± 1.5%) i/s   (14.71 ms/i) -    342.000 in   5.033062s
                  oj    100.876 (± 2.0%) i/s    (9.91 ms/i) -    510.000 in   5.058080s
           rapidjson     26.389 (± 7.6%) i/s   (37.89 ms/i) -    132.000 in   5.027681s

Comparison:
                json:       68.0 i/s
                  oj:      100.9 i/s - 1.48x  faster
           rapidjson:       26.4 i/s - 2.58x  slower
```

After:

```
== Encoding mostly utf8 (20004001 bytes)
ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json     7.000 i/100ms
                  oj    10.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     75.187 (± 2.7%) i/s   (13.30 ms/i) -    378.000 in   5.030111s
                  oj     95.196 (± 2.1%) i/s   (10.50 ms/i) -    480.000 in   5.043565s
           rapidjson     25.969 (± 3.9%) i/s   (38.51 ms/i) -    130.000 in   5.011471s

Comparison:
                json:       75.2 i/s
                  oj:       95.2 i/s - 1.27x  faster
           rapidjson:       26.0 i/s - 2.90x  slower
```
  • Loading branch information
byroot committed Oct 18, 2024
1 parent f8166c2 commit 31d811f
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 64 deletions.
3 changes: 3 additions & 0 deletions benchmark/encoder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
benchmark_encoding "small nested array", [[1,2,3,4,5]]*10
benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }

# On this one we're a bit slower (~25%).
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 2000), except: %i(json_state)

# On these three benchmarks we perform well. Either on par or very closely faster/slower
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 2000), except: %i(json_state)
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)
Expand Down
162 changes: 98 additions & 64 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,65 +25,71 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256], bool out_script_safe)
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *ptr = RSTRING_PTR(str);
unsigned long len = RSTRING_LEN(str);

unsigned long beg = 0, pos;
unsigned long beg = 0, pos = 0;

for (pos = 0; pos < len;) {
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;

while (pos < len) {
unsigned char ch = ptr[pos];
unsigned char ch_len = escape_table[ch];
/* JSON encoding */
if (escape_table[ch]) {
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
switch (ch) {
case '"': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': FLUSH_POS(1); fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': FLUSH_POS(1); fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': FLUSH_POS(1); fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': FLUSH_POS(1); fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': FLUSH_POS(1); fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': FLUSH_POS(1); fbuffer_append(out_buffer, "\\t", 2); break;
default: {
if ((ch & 0x80) == 0x00) { /* leading 1 bit is 0b0 */
FLUSH_POS(1);
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
} else if ((ch & 0xE0) == 0xC0) { /* leading 3 bits are 0b110 */
pos += 2;
} else if ((ch & 0xF0) == 0xE0) { /* leading 4 bits are 0b1110 */
unsigned char b2 = ptr[pos + 1];

if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 0:
pos++;
break;
case 1: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default: {
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
break;
}
}
break;
}
case 3: {
unsigned char b2 = ptr[pos + 1];
if (out_script_safe && b2 == 0x80) {
unsigned char b3 = ptr[pos + 2];
if (out_script_safe && (b2 == 0x80)) {
if (b3 == 0xA8) {
FLUSH_POS(3);
fprintf(stderr, "escape: \\u2028 pos = %ld\n", pos);
fbuffer_append(out_buffer, "\\u2028", 6);
} else if (b3 == 0xA9) {
FLUSH_POS(3);
fprintf(stderr, "escape: \\u2029 pos = %ld\n", pos);
fbuffer_append(out_buffer, "\\u2029", 6);
} else {
pos += 3;
}
} else {
pos += 3;
if (b3 == 0xA8) {
FLUSH_POS(3);
fprintf(stderr, "escape: \\u2028 pos = %ld\n", pos);
fbuffer_append(out_buffer, "\\u2028", 6);
break;
} else if (b3 == 0xA9) {
FLUSH_POS(3);
fprintf(stderr, "escape: \\u2029 pos = %ld\n", pos);
fbuffer_append(out_buffer, "\\u2029", 6);
break;
}
} else if ((ch & 0xF8) == 0xF0) { /* leading 5 bits are 0b11110 */
pos += 4;
} else {
// This should be unreachable
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
}
// fallthrough
}
default:
pos += ch_len;
break;
}
} else {
pos++;
Expand All @@ -98,29 +104,57 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca
RB_GC_GUARD(str);
}

static const bool escape_table[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
/*
 * Lookup table indexed by byte value.
 *   0      - single-byte character that needs no escaping.
 *   1      - single byte that is routed to the escaping path: ASCII control
 *            characters, '"', '\\', and bytes that cannot start a UTF-8
 *            sequence (continuation bytes, 0xFE, 0xFF).
 *   2..6   - byte length of the sequence introduced by this lead byte.
 */
static const char escape_table[256] = {
// ASCII Control Characters (0x00-0x1F)
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters (0x20-0x7F)
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte (0x80-0xBF)
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point (0xC0-0xDF)
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 3-byte code point (0xE0-0xEF)
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// First byte of a 4+ byte code point (0xF0-0xF7 => 4, 0xF8-0xFB => 5,
// 0xFC-0xFD => 6); 0xFE and 0xFF are never lead bytes and are marked 1.
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};

static const bool script_safe_escape_table[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
/*
 * Lookup table indexed by byte value, script-safe variant: same layout as a
 * plain escape table, except that '/' is also flagged for escaping.
 *   0      - single-byte character that needs no escaping.
 *   1      - single byte that is routed to the escaping path: ASCII control
 *            characters, '"', '/', '\\', and bytes that cannot start a UTF-8
 *            sequence (continuation bytes, 0xFE, 0xFF).
 *   2..6   - byte length of the sequence introduced by this lead byte.
 */
static const char script_safe_escape_table[256] = {
// ASCII Control Characters (0x00-0x1F)
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters (0x20-0x7F)
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte (0x80-0xBF)
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point (0xC0-0xDF)
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 3-byte code point (0xE0-0xEF)
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// First byte of a 4+ byte code point (0xF0-0xF7 => 4, 0xF8-0xFB => 5,
// 0xFC-0xFD => 6); 0xFE and 0xFF are never lead bytes and are marked 1.
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};

static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256])
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
Expand Down

0 comments on commit 31d811f

Please sign in to comment.