1818#include < windows.h>
1919#endif
2020
21- // / Determine endianness of the architecture
22- // / \return True if the architecture is little_endian
23- bool is_little_endian_arch ()
24- {
25- uint32_t i=1 ;
26- return reinterpret_cast <uint8_t &>(i) != 0 ;
27- }
28-
29- #define BUFSIZE 100
30-
3121std::string narrow (const wchar_t *s)
3222{
3323 #ifdef _WIN32
@@ -138,9 +128,10 @@ static void utf8_append_code(unsigned int c, std::string &result)
138128 }
139129}
140130
141- // / \param utf32: encoded wide string
131+ // / \param s UTF-32 encoded wide string
142132// / \return utf8-encoded string with the same unicode characters as the input.
143- std::string utf32_to_utf8 (const std::basic_string<unsigned int > &s)
133+ std::string
134+ utf32_native_endian_to_utf8 (const std::basic_string<unsigned int > &s)
144135{
145136 std::string result;
146137
@@ -166,51 +157,37 @@ std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
166157 return argv_narrow;
167158}
168159
169- // / A helper function for dealing with different UTF16 endians
170- // / \par parameters: A 16-bit integer
171- // / \return A 16-bit integer with bytes swapped
172- uint16_t do_swap_bytes (uint16_t x)
173- {
174- uint16_t b1=x & 0xFF ;
175- uint16_t b2=x & 0xFF00 ;
176- return (b1 << 8 ) | (b2 >> 8 );
177- }
178-
179-
/// Append the UTF-16 encoding of a code point to a wide string.
///
/// Code points up to and including 0xFFFF are appended as a single code unit;
/// anything larger is split into a high/low surrogate pair. Each code unit is
/// stored as a plain integer value in one wchar_t, so no byte swapping takes
/// place here.
/// \param code: code point to encode
/// \param result: string that the encoded code unit(s) are appended to
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // 0xFFFF itself still fits into one code unit; sending it down the
  // surrogate path would make code-0x10000 wrap around.
  if(code<=0xFFFF)
  {
    // code is encoded as one UTF16 character
    result+=static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: the upper 10 bits of the offset go into the
    // high surrogate, the lower 10 bits into the low surrogate
    code=code-0x10000;
    const uint16_t i1=static_cast<uint16_t>(((code>>10) & 0x3ff) | 0xD800);
    result+=static_cast<wchar_t>(i1);
    const uint16_t i2=static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result+=static_cast<wchar_t>(i2);
  }
}
207184
208185
209- // / \par parameters: String in UTF-8 format, bool value indicating whether the
210- // / endianness should be different from the architecture one.
/// Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
/// \param in: String in UTF-8 format
/// \return String in UTF-16 format, encoded with the endianness of the
///   architecture.
213- std::wstring utf8_to_utf16 (const std::string& in, bool swap_bytes )
190+ std::wstring utf8_to_utf16_native_endian (const std::string &in )
214191{
215192 std::wstring result;
216193 result.reserve (in.size ());
@@ -263,33 +240,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
263240 code=32 ;
264241 }
265242
266- utf16_append_code (code, swap_bytes, result);
243+ utf16_append_code (code, result);
267244 }
268245
269246 return result;
270247}
271248
272- // / \par parameters: String in UTF-8 format
273- // / \return String in UTF-16BE format
274- std::wstring utf8_to_utf16_big_endian (const std::string &in)
275- {
276- bool swap_bytes=is_little_endian_arch ();
277- return utf8_to_utf16 (in, swap_bytes);
278- }
279-
280- // / \par parameters: String in UTF-8 format
281- // / \return String in UTF-16LE format
282- std::wstring utf8_to_utf16_little_endian (const std::string &in)
283- {
284- bool swap_bytes=!is_little_endian_arch ();
285- return utf8_to_utf16 (in, swap_bytes);
286- }
287-
288- // / \param ch: UTF-16LE character
249+ // / \param ch: UTF-16 character in architecture-native endianness encoding
289250// / \param result: stream to receive string in US-ASCII format, with \\uxxxx
290251// / escapes for other characters
291252// / \param loc: locale to check for printable characters
292- static void utf16_little_endian_to_java (
253+ static void utf16_native_endian_to_java (
293254 const wchar_t ch,
294255 std::ostringstream &result,
295256 const std::locale &loc)
@@ -326,23 +287,23 @@ static void utf16_little_endian_to_java(
326287 }
327288}
328289
329- // / \param ch: UTF-16LE character
290+ // / \param ch: UTF-16 character in architecture-native endianness encoding
330291// / \return String in US-ASCII format, with \\uxxxx escapes for other characters
331- std::string utf16_little_endian_to_java (const wchar_t ch)
292+ std::string utf16_native_endian_to_java (const wchar_t ch)
332293{
333294 std::ostringstream result;
334295 const std::locale loc;
335- utf16_little_endian_to_java (ch, result, loc);
296+ utf16_native_endian_to_java (ch, result, loc);
336297 return result.str ();
337298}
338299
339- // / \param in: String in UTF-16LE format
300+ // / \param in: String in UTF-16 (native endianness) format
340301// / \return String in US-ASCII format, with \\uxxxx escapes for other characters
341- std::string utf16_little_endian_to_java (const std::wstring &in)
302+ std::string utf16_native_endian_to_java (const std::wstring &in)
342303{
343304 std::ostringstream result;
344305 const std::locale loc;
345306 for (const auto ch : in)
346- utf16_little_endian_to_java (ch, result, loc);
307+ utf16_native_endian_to_java (ch, result, loc);
347308 return result.str ();
348309}
0 commit comments