@@ -63,6 +63,39 @@ void encodeUTF8(char *&dst, uint32_t cp) {
63
63
dst = d;
64
64
}
65
65
66
+ // / The following logic is a combination of ES14 11.1.4 CodePointAt() and
67
+ // / what https://infra.spec.whatwg.org/#strings says about what to do with
68
+ // / singular surrogates: "To convert a string into a scalar value string,
69
+ // / replace any surrogates with U+FFFD." Therefore, if we encounter any lone
70
+ // / surrogate, replace the value with UNICODE_REPLACEMENT_CHARACTER (U+FFFD).
71
+ // / The result of this process is that the enclosing for-loop processes only
72
+ // / scalar values (aka a code point that is not a surrogate).
73
+ // / \param cur Iterator pointing to the current character
74
+ // / \param end Iterator pointing to the end of the string
75
+ // / \return std::pair with first element being the Unicode code point, and the
76
+ // / second being how many code point units were consumed
77
+ static std::pair<char32_t , size_t > convertToCodePointAt (
78
+ llvh::ArrayRef<char16_t >::iterator cur,
79
+ llvh::ArrayRef<char16_t >::iterator end) {
80
+ char16_t c = cur[0 ];
81
+ if (isLowSurrogate (c)) {
82
+ // Unpaired low surrogate.
83
+ return {UNICODE_REPLACEMENT_CHARACTER, 1 };
84
+ } else if (isHighSurrogate (c)) {
85
+ // Leading high surrogate. See if the next character is a low surrogate.
86
+ if (cur + 1 == end || !isLowSurrogate (cur[1 ])) {
87
+ // Trailing or unpaired high surrogate.
88
+ return {UNICODE_REPLACEMENT_CHARACTER, 1 };
89
+ } else {
90
+ // Decode surrogate pair and increment, because we consumed two chars.
91
+ return {utf16SurrogatePairToCodePoint (c, cur[1 ]), 2 };
92
+ }
93
+ } else {
94
+ // Not a surrogate.
95
+ return {c, 1 };
96
+ }
97
+ }
98
+
66
99
bool convertUTF16ToUTF8WithReplacements (
67
100
std::string &out,
68
101
llvh::ArrayRef<char16_t > input,
@@ -85,40 +118,62 @@ bool convertUTF16ToUTF8WithReplacements(
85
118
continue ;
86
119
}
87
120
88
- // The following logic is a combination of ES14 11.1.4 CodePointAt() and
89
- // what https://infra.spec.whatwg.org/#strings says about what to do with
90
- // singular surrogates: "To convert a string into a scalar value string,
91
- // replace any surrogates with U+FFFD." Therefore, if we encounter any lone
92
- // surrogate, replace the value with UNICODE_REPLACEMENT_CHARACTER (U+FFFD).
93
- // The result of this process is that the enclosing for-loop processes only
94
- // scalar values (aka a code point that is not a surrogate).
95
- char32_t c32;
96
- if (isLowSurrogate (cur[0 ])) {
97
- // Unpaired low surrogate.
98
- c32 = UNICODE_REPLACEMENT_CHARACTER;
99
- } else if (isHighSurrogate (cur[0 ])) {
100
- // Leading high surrogate. See if the next character is a low surrogate.
101
- if (cur + 1 == end || !isLowSurrogate (cur[1 ])) {
102
- // Trailing or unpaired high surrogate.
103
- c32 = UNICODE_REPLACEMENT_CHARACTER;
104
- } else {
105
- // Decode surrogate pair and increment, because we consumed two chars.
106
- c32 = utf16SurrogatePairToCodePoint (cur[0 ], cur[1 ]);
107
- ++cur;
121
+ auto [c32, inputConsumed] = convertToCodePointAt (cur, end);
122
+ cur += (inputConsumed - 1 );
123
+
124
+ // The code point to be encoded here is guaranteed to be a valid Unicode
125
+ // code point and not a surrogate, because convertToCodePointAt() replaces
126
+ // any lone surrogate with U+FFFD.
127
+ std::array<char , UTF8CodepointMaxBytes> buff;
128
+ char *ptr = buff.data ();
129
+ encodeUTF8 (ptr, c32);
130
+ out.insert (out.end (), buff.data (), ptr);
131
+ }
132
+ return cur == end;
133
+ }
134
+
135
+ std::pair<uint32_t , uint32_t > convertUTF16ToUTF8BufferWithReplacements (
136
+ llvh::MutableArrayRef<uint8_t > outBuffer,
137
+ llvh::ArrayRef<char16_t > input) {
138
+ uint32_t numRead = 0 ;
139
+ uint32_t numWritten = 0 ;
140
+ uint8_t *writtenPtr = outBuffer.begin ();
141
+ auto end = input.end ();
142
+ for (auto cur = input.begin (); cur < end; ++cur) {
143
+ char16_t c = cur[0 ];
144
+ // ASCII fast-path.
145
+ if (LLVM_LIKELY (c <= 0x7F )) {
146
+ if (numWritten + 1 > outBuffer.size ()) {
147
+ break ;
108
148
}
109
- } else {
110
- // Not a surrogate.
111
- c32 = c;
149
+ *writtenPtr = static_cast <char >(c);
150
+ writtenPtr++;
151
+ numWritten++;
152
+ numRead++;
153
+ continue ;
112
154
}
113
155
114
- // The code point to be converted here is guaranteed to be a valid unicode
115
- // code point and not a surrogate. Because of the conversion above.
116
- char buff[UTF8CodepointMaxBytes];
117
- char *ptr = buff;
156
+ auto [c32, inputConsumed] = convertToCodePointAt (cur, end);
157
+ cur += (inputConsumed - 1 );
158
+
159
+ // The code point to be encoded here is guaranteed to be a valid unicode
160
+ // code point and not a surrogate. Because of the convertToCodePointAt()
161
+ // process.
162
+ std::array<char , UTF8CodepointMaxBytes> buff;
163
+ char *ptr = buff.data ();
118
164
encodeUTF8 (ptr, c32);
119
- out.insert (out.end (), buff, ptr);
165
+
166
+ size_t convertedLength = ptr - buff.data ();
167
+ if (numWritten + convertedLength > outBuffer.size ()) {
168
+ break ;
169
+ }
170
+ std::memcpy (writtenPtr, buff.data (), convertedLength);
171
+ writtenPtr += convertedLength;
172
+ numWritten += convertedLength;
173
+ numRead += inputConsumed;
120
174
}
121
- return cur == end;
175
+
176
+ return {numRead, numWritten};
122
177
}
123
178
124
179
void convertUTF16ToUTF8WithSingleSurrogates (
0 commit comments