The C++ unicode_traits class template makes using Unicode easier.
All you need to do is download one header file, unicode_traits.hpp, and drop it somewhere in your include path.
Consult the unicode_traits reference for details.
In the examples below, the source and target encoding schemes are deduced from the character width: UTF-8 from 8-bit characters, UTF-16 from 16-bit characters, and UTF-32 from 32-bit characters. The character type may be any integral type, signed or unsigned, with a size of 8, 16 or 32 bits.
#include "unicode_traits.hpp"
#include <vector>
#include <string>
#include <iterator>
// Example: transcode one UTF-8 string into UTF-16, UTF-32 and wchar_t text.
// Every call passes the same source range; only the character type of the
// output container differs between the three conversions.
int main()
{
    // 8-bit source text; the trailing four bytes are the encoding of U+1F642.
    std::string source = "Hello world \xf0\x9f\x99\x82";

    // Convert source to UTF16
    std::u16string target1;
    auto result1 = unicons::convert(source.begin(), source.end(),
                                    std::back_inserter(target1),
                                    unicons::conv_flags::strict);

    // Convert source to UTF32
    std::vector<uint32_t> target2;
    auto result2 = unicons::convert(source.begin(), source.end(),
                                    std::back_inserter(target2),
                                    unicons::conv_flags::strict);

    // Convert source to UTF16 (if 16 bit wchar_t) or UTF32 (if 32 bit wchar_t)
    // Note: the type must be spelled std::wstring; there is no using-directive
    // in this example, so an unqualified `wstring` does not compile.
    std::wstring target3;
    auto result3 = unicons::convert(source.begin(), source.end(),
                                    std::back_inserter(target3),
                                    unicons::conv_flags::strict);
}
Hello world 🙂
// A single code point, passed to convert() below as the range [&cp, &cp + 1).
uint32_t cp = 0x1f642;

// Each target already holds a prefix; std::back_inserter appends the converted
// code point after it. The four targets differ only in character type.

// 8-bit characters
std::string target1 = "Hello world ";
auto result1 = unicons::convert(&cp, &cp + 1,
                                std::back_inserter(target1),
                                unicons::conv_flags::strict);

// 16-bit characters
std::u16string target2 = u"Hello world ";
auto result2 = unicons::convert(&cp, &cp + 1,
                                std::back_inserter(target2),
                                unicons::conv_flags::strict);

// 32-bit characters
std::u32string target3 = U"Hello world ";
auto result3 = unicons::convert(&cp, &cp + 1,
                                std::back_inserter(target3),
                                unicons::conv_flags::strict);

// wchar_t characters (width is platform dependent)
std::wstring target4 = L"Hello world ";
auto result4 = unicons::convert(&cp, &cp + 1,
                                std::back_inserter(target4),
                                unicons::conv_flags::strict);
Hello world 🙂
std::string source = "Hi \xf0\x9f\x99\x82"; // U+1F642
std::error_code ec;
auto it = unicons::make_codepoint_iterator(source.begin(),source.end(),ec);
auto last = end(it);
while (!ec && it != last)
{
uint32_t codepoint = *it;
it.increment(ec);
}
H
i
🙂
// Validate an 8-bit sequence and report the problem, if there is one.
// The final byte, \xFA, has nothing following it in the string.
std::string source = "\xE6\x97\xA5\xD1\x88\xFA";
auto outcome = unicons::validate(source.begin(), source.end());
if (outcome.ec)
{
    // A set error value is turned into its human-readable message.
    const auto message = make_error_code(outcome.ec).message();
    std::cout << message << std::endl;
}
Output:
Partial character in source, but hit end
// Validate a 16-bit sequence and report the problem, if there is one.
// 0xD888 is in the high-surrogate range (0xD800-0xDBFF) and the unit that
// follows it, 0x1234, is not a low surrogate.
std::u16string source = u"\xD888\x1234";
auto outcome = unicons::validate(source.begin(), source.end());
if (outcome.ec)
{
    // A set error value is turned into its human-readable message.
    const auto message = make_error_code(outcome.ec).message();
    std::cout << message << std::endl;
}
Output:
Unpaired high surrogate UTF-16
unicode_traits requires a C++11 compiler. It is tested in continuous integration on AppVeyor, Travis, and doozer.
UndefinedBehaviorSanitizer (UBSan) diagnostics are enabled for selected gcc and clang builds.
| Compiler | Version | Architecture | Operating System |
|---|---|---|---|
| Microsoft Visual Studio | vs2015 (MSVC 19.0.24241.7) | x86,x64 | Windows 10 |
| | vs2017 | x86,x64 | Windows 10 |
| | vs2019 | x86,x64 | Windows 10 |
| g++ | 4.8 and above | x64 | Ubuntu |
| | 4.8.5 | x64 | CentOS 7.6 |
| | 6.3.1 (Red Hat 6.3.1-1) | x64 | Fedora release 24 |
| | 4.9.2 | i386 | Debian 8 |
| clang | 3.8 and above | x64 | Ubuntu |
| clang xcode | 6.4 and above | x64 | OSX |