diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index cb054532..c35e86d9 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -5,9 +5,9 @@ #define RB_UNLIKELY(cond) (cond) #endif -static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError; +static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError, Encoding_UTF_8; -static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend; +static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_encode; /* Converts in_string to a JSON string (without the wrapping '"' * characters) in FBuffer out_buffer. @@ -735,20 +735,41 @@ static void generate_json_array(FBuffer *buffer, VALUE Vstate, JSON_Generator_St fbuffer_append_char(buffer, ']'); } -static int usascii_encindex, utf8_encindex; +static int usascii_encindex, utf8_encindex, binary_encindex; -static int enc_utf8_compatible_p(int enc_idx) +static inline int enc_utf8_compatible_p(int enc_idx) { if (enc_idx == usascii_encindex) return 1; if (enc_idx == utf8_encindex) return 1; return 0; } +static inline VALUE ensure_valid_encoding(VALUE str) +{ + int encindex = RB_ENCODING_GET(str); + VALUE utf8_string; + if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) { + if (encindex == binary_encindex) { + // For historical reason, we silently reinterpret binary strings as UTF-8 if it would work. + // TODO: Deprecate in 2.8.0 + // TODO: Remove in 3.0.0 + utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex); + switch (rb_enc_str_coderange(utf8_string)) { + case ENC_CODERANGE_7BIT: + case ENC_CODERANGE_VALID: + return utf8_string; + break; + } + } + + str = rb_funcall(str, i_encode, 1, Encoding_UTF_8); + } + return str; +} + static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj) { - if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) { - obj = rb_str_export_to_enc(obj, rb_utf8_encoding()); - } + obj = ensure_valid_encoding(obj); fbuffer_append_char(buffer, '"'); @@ -1462,6 +1483,9 @@ void Init_generator(void) VALUE mNilClass = rb_define_module_under(mGeneratorMethods, "NilClass"); rb_define_method(mNilClass, "to_json", mNilClass_to_json, -1); + rb_global_variable(&Encoding_UTF_8); + Encoding_UTF_8 = rb_const_get(rb_path2class("Encoding"), rb_intern("UTF_8")); + i_to_s = rb_intern("to_s"); i_to_json = rb_intern("to_json"); i_new = rb_intern("new"); @@ -1469,7 +1493,9 @@ void Init_generator(void) i_unpack = rb_intern("unpack"); i_create_id = rb_intern("create_id"); i_extend = rb_intern("extend"); + i_encode = rb_intern("encode"); usascii_encindex = rb_usascii_encindex(); utf8_encindex = rb_utf8_encindex(); + binary_encindex = rb_ascii8bit_encindex(); } diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 62effbd0..cf0b3cef 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1794,6 +1794,9 @@ static VALUE convert_encoding(VALUE source) } if (encindex == binary_encindex) { + // For historical reason, we silently reinterpret binary strings as UTF-8 if it would work. + // TODO: Deprecate in 2.8.0 + // TODO: Remove in 3.0.0 return rb_enc_associate_index(rb_str_dup(source), utf8_encindex); } @@ -1943,7 +1946,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self) } -#line 1947 "parser.c" +#line 1950 "parser.c" enum {JSON_start = 1}; enum {JSON_first_final = 10}; enum {JSON_error = 0}; @@ -1951,7 +1954,7 @@ enum {JSON_error = 0}; enum {JSON_en_main = 1}; -#line 855 "parser.rl" +#line 858 "parser.rl" /* @@ -1969,16 +1972,16 @@ static VALUE cParser_parse(VALUE self) GET_PARSER; -#line 1973 "parser.c" +#line 1976 "parser.c" { cs = JSON_start; } -#line 872 "parser.rl" +#line 875 "parser.rl" p = json->source; pe = p + json->len; -#line 1982 "parser.c" +#line 1985 "parser.c" { if ( p == pe ) goto _test_eof; @@ -2012,7 +2015,7 @@ case 1: cs = 0; goto _out; tr2: -#line 847 "parser.rl" +#line 850 "parser.rl" { char *np = JSON_parse_value(json, p, pe, &result, 0); if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;} @@ -2022,7 +2025,7 @@ cs = 0; if ( ++p == pe ) goto _test_eof10; case 10: -#line 2026 "parser.c" +#line 2029 "parser.c" switch( (*p) ) { case 13: goto st10; case 32: goto st10; @@ -2111,7 +2114,7 @@ case 9: _out: {} } -#line 875 "parser.rl" +#line 878 "parser.rl" if (cs >= JSON_first_final && p == pe) { return result; diff --git a/ext/json/ext/parser/parser.rl b/ext/json/ext/parser/parser.rl index 6ebb2f6f..73f81341 100644 --- a/ext/json/ext/parser/parser.rl +++ b/ext/json/ext/parser/parser.rl @@ -689,6 +689,9 @@ static VALUE convert_encoding(VALUE source) } if (encindex == binary_encindex) { + // For historical reason, we silently reinterpret binary strings as UTF-8 if it would work. + // TODO: Deprecate in 2.8.0 + // TODO: Remove in 3.0.0 return rb_enc_associate_index(rb_str_dup(source), utf8_encindex); } diff --git a/lib/json/pure/generator.rb b/lib/json/pure/generator.rb index 2e38f6bc..517d4e27 100644 --- a/lib/json/pure/generator.rb +++ b/lib/json/pure/generator.rb @@ -339,6 +339,7 @@ def generate(obj) private def fast_serialize_string(string, buf) # :nodoc: buf << '"' string = string.encode(::Encoding::UTF_8) unless string.encoding == ::Encoding::UTF_8 + raise GeneratorError, "source sequence is illegal/malformed utf-8" unless string.valid_encoding? if /["\\\x0-\x1f]/n.match?(string) buf << string.gsub(/["\\\x0-\x1f]/n, MAP) diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index 667baa03..7dc45e3a 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -443,6 +443,19 @@ def test_invalid_encoding_string "\x82\xAC\xEF".to_json end assert_includes error.message, "source sequence is illegal/malformed utf-8" + + error = assert_raise(JSON::GeneratorError) do + JSON.dump("\x82\xAC\xEF") + end + assert_includes error.message, "source sequence is illegal/malformed utf-8" + + assert_raise(Encoding::UndefinedConversionError) do + "\x82\xAC\xEF".b.to_json + end + + assert_raise(Encoding::UndefinedConversionError) do + JSON.dump("\x82\xAC\xEF".b) + end end if defined?(JSON::Ext::Generator) and RUBY_PLATFORM != "java"