Skip to content
This repository was archived by the owner on Jun 17, 2022. It is now read-only.

Commit f32df99

Browse files
committed
Merge branch '2016_04_unicode' into bitcoin
Merge UTF-8 support. See jgarzik#22.
2 parents 280b191 + c9a716c commit f32df99

10 files changed

+178
-34
lines changed

Makefile.am

+7-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I build-aux/m4
33
.INTERMEDIATE: $(GENBIN)
44

55
include_HEADERS = include/univalue.h
6-
noinst_HEADERS = lib/univalue_escapes.h
6+
noinst_HEADERS = lib/univalue_escapes.h lib/univalue_utffilter.h
77

88
lib_LTLIBRARIES = libunivalue.la
99

@@ -73,6 +73,10 @@ TEST_FILES = \
7373
$(TEST_DATA_DIR)/fail35.json \
7474
$(TEST_DATA_DIR)/fail36.json \
7575
$(TEST_DATA_DIR)/fail37.json \
76+
$(TEST_DATA_DIR)/fail38.json \
77+
$(TEST_DATA_DIR)/fail39.json \
78+
$(TEST_DATA_DIR)/fail40.json \
79+
$(TEST_DATA_DIR)/fail41.json \
7680
$(TEST_DATA_DIR)/fail3.json \
7781
$(TEST_DATA_DIR)/fail4.json \
7882
$(TEST_DATA_DIR)/fail5.json \
@@ -83,6 +87,7 @@ TEST_FILES = \
8387
$(TEST_DATA_DIR)/pass1.json \
8488
$(TEST_DATA_DIR)/pass2.json \
8589
$(TEST_DATA_DIR)/pass3.json \
86-
$(TEST_DATA_DIR)/round1.json
90+
$(TEST_DATA_DIR)/round1.json \
91+
$(TEST_DATA_DIR)/round2.json
8792

8893
EXTRA_DIST=$(TEST_FILES) $(GEN_SRCS)

lib/univalue_read.cpp

+15-22
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <vector>
77
#include <stdio.h>
88
#include "univalue.h"
9+
#include "univalue_utffilter.h"
910

1011
using namespace std;
1112

@@ -174,41 +175,31 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
174175
raw++; // skip "
175176

176177
string valStr;
178+
JSONUTF8StringFilter writer(valStr);
177179

178180
while (*raw) {
179-
if (*raw < 0x20)
181+
if ((unsigned char)*raw < 0x20)
180182
return JTOK_ERR;
181183

182184
else if (*raw == '\\') {
183185
raw++; // skip backslash
184186

185187
switch (*raw) {
186-
case '"': valStr += "\""; break;
187-
case '\\': valStr += "\\"; break;
188-
case '/': valStr += "/"; break;
189-
case 'b': valStr += "\b"; break;
190-
case 'f': valStr += "\f"; break;
191-
case 'n': valStr += "\n"; break;
192-
case 'r': valStr += "\r"; break;
193-
case 't': valStr += "\t"; break;
188+
case '"': writer.push_back('\"'); break;
189+
case '\\': writer.push_back('\\'); break;
190+
case '/': writer.push_back('/'); break;
191+
case 'b': writer.push_back('\b'); break;
192+
case 'f': writer.push_back('\f'); break;
193+
case 'n': writer.push_back('\n'); break;
194+
case 'r': writer.push_back('\r'); break;
195+
case 't': writer.push_back('\t'); break;
194196

195197
case 'u': {
196198
unsigned int codepoint;
197199
if (hatoui(raw + 1, raw + 1 + 4, codepoint) !=
198200
raw + 1 + 4)
199201
return JTOK_ERR;
200-
201-
if (codepoint <= 0x7f)
202-
valStr.push_back((char)codepoint);
203-
else if (codepoint <= 0x7FF) {
204-
valStr.push_back((char)(0xC0 | (codepoint >> 6)));
205-
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
206-
} else if (codepoint <= 0xFFFF) {
207-
valStr.push_back((char)(0xE0 | (codepoint >> 12)));
208-
valStr.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
209-
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
210-
}
211-
202+
writer.push_back_u(codepoint);
212203
raw += 4;
213204
break;
214205
}
@@ -226,11 +217,13 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
226217
}
227218

228219
else {
229-
valStr += *raw;
220+
writer.push_back(*raw);
230221
raw++;
231222
}
232223
}
233224

225+
if (!writer.finalize())
226+
return JTOK_ERR;
234227
tokenVal = valStr;
235228
consumed = (raw - rawStart);
236229
return JTOK_STRING;

lib/univalue_utffilter.h

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright 2016 Wladimir J. van der Laan
2+
// Distributed under the MIT software license, see the accompanying
3+
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4+
#ifndef UNIVALUE_UTFFILTER_H
5+
#define UNIVALUE_UTFFILTER_H
6+
7+
#include <string>
8+
9+
/**
 * Filter that generates and validates UTF-8, as well as collates UTF-16
 * surrogate pairs as specified in RFC4627.
 *
 * Input arrives either as raw bytes (push_back) or as code points decoded
 * from \uXXXX escapes (push_back_u). Accepted output is appended, as UTF-8,
 * to the string passed to the constructor. Errors are sticky: once any input
 * has been rejected, finalize() returns false.
 *
 * NOTE(review): raw UTF-8 input is only checked structurally (a lead byte
 * followed by the right number of continuation bytes). Overlong encodings and
 * values above U+10FFFF are not rejected, and UTF-8-encoded surrogate halves
 * are handled exactly as if they had come from \u escapes.
 */
class JSONUTF8StringFilter
{
public:
    /** Bind the filter to the output string @p s; the string is held by reference. */
    JSONUTF8StringFilter(std::string &s):
        str(s), is_valid(true), codepoint(0), state(0), surpair(0)
    {
    }

    /** Write single 8-bit char (may be part of UTF-8 sequence). */
    void push_back(unsigned char ch)
    {
        if (state == 0) {
            if (ch < 0x80) { // 7-bit ASCII, fast direct pass-through
                // An open surrogate pair must be closed by its second half;
                // a plain character in between leaves the first half unpaired.
                if (surpair)
                    is_valid = false;
                else
                    str.push_back(static_cast<char>(ch));
            } else if (ch < 0xc0) { // Mid-sequence character, invalid in this state
                is_valid = false;
            } else if (ch < 0xe0) { // Start of 2-byte sequence
                codepoint = (ch & 0x1f) << 6;
                state = 6;
            } else if (ch < 0xf0) { // Start of 3-byte sequence
                codepoint = (ch & 0x0f) << 12;
                state = 12;
            } else if (ch < 0xf8) { // Start of 4-byte sequence
                codepoint = (ch & 0x07) << 18;
                state = 18;
            } else { // Reserved, invalid
                is_valid = false;
            }
        } else {
            if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
                is_valid = false;
            // Each continuation byte contributes the next lower 6 bits.
            state -= 6;
            codepoint |= (ch & 0x3f) << state;
            if (state == 0)
                push_back_u(codepoint);
        }
    }

    /** Write codepoint directly, possibly collating surrogate pairs. */
    void push_back_u(unsigned int cp)
    {
        if (state) // Only accept full codepoints in open state
            is_valid = false;
        if (cp >= 0xD800 && cp < 0xDC00) { // First half of surrogate pair
            if (surpair) // Two subsequent surrogate pair openers - fail
                is_valid = false;
            else
                surpair = cp;
        } else if (cp >= 0xDC00 && cp < 0xE000) { // Second half of surrogate pair
            if (surpair) { // Open surrogate pair, expect second half
                // Compute code point from UTF-16 surrogate pair. The 0x10000
                // offset has to be added, not OR-ed: the shifted high half
                // can itself have bit 16 set (planes 2 and above).
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) + (cp - 0xDC00));
                surpair = 0;
            } else { // Second half doesn't follow a first half - fail
                is_valid = false;
            }
        } else {
            if (surpair) // First half of surrogate pair not followed by second - fail
                is_valid = false;
            else
                append_codepoint(cp);
        }
    }

    /**
     * Check that we're in a state where the string can be ended:
     * no open sequences, no open surrogate pairs, and no earlier error.
     * @return true if everything written so far was accepted.
     */
    bool finalize()
    {
        if (state || surpair)
            is_valid = false;
        return is_valid;
    }

private:
    std::string &str;
    bool is_valid;
    // Current UTF-8 decoding state
    unsigned int codepoint;
    int state; // Top bit to be filled in for next UTF-8 byte, or 0

    // Keep track of the following state to handle the following section of
    // RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    //  Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

    // Append the UTF-8 encoding (1 to 4 bytes) of cp to the output string.
    // Values above 0x1FFFFF produce no output.
    void append_codepoint(unsigned int cp)
    {
        if (cp <= 0x7f) {
            str.push_back(static_cast<char>(cp));
        } else if (cp <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (cp >> 6)));
            str.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else if (cp <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (cp >> 12)));
            str.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else if (cp <= 0x1FFFFF) {
            str.push_back(static_cast<char>(0xF0 | (cp >> 18)));
            str.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
    }
};
118+
119+
#endif

lib/univalue_write.cpp

+1-10
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
#include "univalue.h"
99
#include "univalue_escapes.h"
1010

11-
// TODO: Using UTF8
12-
1311
using namespace std;
1412

1513
static string json_escape(const string& inS)
@@ -23,15 +21,8 @@ static string json_escape(const string& inS)
2321

2422
if (escStr)
2523
outS += escStr;
26-
27-
else if (ch < 0x80)
24+
else
2825
outS += ch;
29-
30-
else { // TODO handle UTF-8 properly
31-
char tmpesc[16];
32-
sprintf(tmpesc, "\\u%04x", ch);
33-
outS += tmpesc;
34-
}
3526
}
3627

3728
return outS;

test/fail38.json

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
["\ud834"]

test/fail39.json

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
["\udd61"]

test/fail40.json

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
["���"]

test/fail41.json

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[""]

test/round2.json

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
["a§■𐎒𝅘𝅥𝅯"]

test/unitester.cpp

+31
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ string srcdir(JSON_TEST_SRC);
2222
static bool test_failed = false;
2323

2424
// Test assertion for file-driven tests: when expr is false, sets test_failed
// and prints "<filename> failed" to stderr. Expects a variable named
// `filename` providing c_str() to be in scope where the macro is expanded.
#define d_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", filename.c_str()); } }
25+
// Test assertion for function-level tests: when expr is false, sets
// test_failed and prints "<enclosing function name> failed" (via __func__)
// to stderr.
#define f_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", __func__); } }
2526

2627
static std::string rtrim(std::string s)
2728
{
@@ -108,6 +109,10 @@ static const char *filenames[] = {
108109
"fail35.json",
109110
"fail36.json",
110111
"fail37.json",
112+
"fail38.json", // invalid unicode: only first half of surrogate pair
113+
"fail39.json", // invalid unicode: only second half of surrogate pair
114+
"fail40.json", // invalid unicode: broken UTF-8
115+
"fail41.json", // invalid unicode: unfinished UTF-8
111116
"fail3.json",
112117
"fail4.json", // extra comma
113118
"fail5.json",
@@ -119,14 +124,40 @@ static const char *filenames[] = {
119124
"pass2.json",
120125
"pass3.json",
121126
"round1.json", // round-trip test
127+
"round2.json", // unicode
122128
};
123129

130+
// Test \u handling
131+
void unescape_unicode_test()
132+
{
133+
UniValue val;
134+
bool testResult;
135+
// Escaped ASCII (quote)
136+
testResult = val.read("[\"\\u0022\"]");
137+
f_assert(testResult);
138+
f_assert(val[0].get_str() == "\"");
139+
// Escaped Basic Plane character, two-byte UTF-8
140+
testResult = val.read("[\"\\u0191\"]");
141+
f_assert(testResult);
142+
f_assert(val[0].get_str() == "\xc6\x91");
143+
// Escaped Basic Plane character, three-byte UTF-8
144+
testResult = val.read("[\"\\u2191\"]");
145+
f_assert(testResult);
146+
f_assert(val[0].get_str() == "\xe2\x86\x91");
147+
// Escaped Supplementary Plane character U+1d161
148+
testResult = val.read("[\"\\ud834\\udd61\"]");
149+
f_assert(testResult);
150+
f_assert(val[0].get_str() == "\xf0\x9d\x85\xa1");
151+
}
152+
124153
int main (int argc, char *argv[])
125154
{
126155
for (unsigned int fidx = 0; fidx < ARRAY_SIZE(filenames); fidx++) {
127156
runtest_file(filenames[fidx]);
128157
}
129158

159+
unescape_unicode_test();
160+
130161
return test_failed ? 1 : 0;
131162
}
132163

0 commit comments

Comments
 (0)