Skip to content

Commit

Permalink
refs #18, unicode error raise can now be enabled with define GHC_RAIS…
Browse files Browse the repository at this point in the history
…E_UNICODE_ERRORS
  • Loading branch information
gulrak committed Jun 2, 2019
1 parent 2969bad commit 98dad7f
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 3 deletions.
64 changes: 63 additions & 1 deletion include/ghc/filesystem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,11 @@
// as ghc::filesystem::string_type.
// #define GHC_WIN_WSTRING_STRING_TYPE
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Raise errors/exceptions when invalid unicode codepoints or UTF-8 sequences are found,
// instead of replacing them with the unicode replacement character (U+FFFD).
// #define GHC_RAISE_UNICODE_ERRORS
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// ghc::filesystem version in decimal (major * 10000 + minor * 100 + patch)
#define GHC_FILESYSTEM_VERSION 10199L

Expand Down Expand Up @@ -1209,7 +1214,11 @@ GHC_INLINE void appendUTF8(std::string& str, uint32_t unicode)
str.push_back(static_cast<char>((unicode & 0x3f) + 128));
}
else {
#ifdef GHC_RAISE_UNICODE_ERRORS
throw filesystem_error("Illegal code point for unicode character.", str, std::make_error_code(std::errc::illegal_byte_sequence));
#else
appendUTF8(str, 0xfffd);
#endif
}
}

Expand All @@ -1228,6 +1237,22 @@ GHC_INLINE unsigned consumeUtf8Fragment(const unsigned state, const uint8_t frag
return state == S_RJCT ? static_cast<unsigned>(S_RJCT) : static_cast<unsigned>((utf8_state_info[category + 16] >> (state << 2)) & 0xf);
}

// Reports whether utf8String decodes cleanly when fed byte by byte through
// consumeUtf8Fragment.
//
// Returns false as soon as the decoder reaches S_RJCT, and also when the
// decoder state is non-zero after the last byte; returns true otherwise
// (including for an empty string, where the state stays at S_STRT).
GHC_INLINE bool validUtf8(const std::string& utf8String)
{
    unsigned state = S_STRT;
    // Required as the output argument of consumeUtf8Fragment; the decoded
    // value itself is not inspected here.
    std::uint32_t decoded = 0;
    for (const char ch : utf8String) {
        state = consumeUtf8Fragment(state, static_cast<uint8_t>(ch), decoded);
        if (state == S_RJCT) {
            return false;
        }
    }
    // NOTE(review): a non-zero final state presumably means the input ended
    // in the middle of a multi-byte sequence (assumes S_STRT == 0) - confirm
    // against the state enum.
    return state == 0;
}

} // namespace detail

#endif
Expand Down Expand Up @@ -1261,13 +1286,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
codepoint = 0;
}
else if (utf8_state == S_RJCT) {
#ifdef GHC_RAISE_UNICODE_ERRORS
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
#else
result += (typename StringType::value_type)0xfffd;
utf8_state = S_STRT;
codepoint = 0;
#endif
}
}
if (utf8_state) {
#ifdef GHC_RAISE_UNICODE_ERRORS
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
#else
result += (typename StringType::value_type)0xfffd;
#endif
}
return result;
}
Expand All @@ -1286,13 +1319,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
codepoint = 0;
}
else if (utf8_state == S_RJCT) {
#ifdef GHC_RAISE_UNICODE_ERRORS
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
#else
result += (typename StringType::value_type)0xfffd;
utf8_state = S_STRT;
codepoint = 0;
#endif
}
}
if (utf8_state) {
#ifdef GHC_RAISE_UNICODE_ERRORS
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
#else
result += (typename StringType::value_type)0xfffd;
#endif
}
return result;
}
Expand All @@ -1315,10 +1356,14 @@ inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicode
appendUTF8(result, (char32_t(c) << 10) + *iter - 0x35fdc00);
}
else {
#ifdef GHC_RAISE_UNICODE_ERRORS
throw filesystem_error("Illegal code point for unicode character.", result, std::make_error_code(std::errc::illegal_byte_sequence));
#else
appendUTF8(result, 0xfffd);
if(iter == unicodeString.end()) {
break;
}
#endif
}
}
else {
Expand Down Expand Up @@ -1359,6 +1404,13 @@ GHC_INLINE bool startsWith(const std::string& what, const std::string& with)

GHC_INLINE void path::postprocess_path_with_format(path::impl_string_type& p, path::format fmt)
{
#ifdef GHC_RAISE_UNICODE_ERRORS
if(!detail::validUtf8(p)) {
path t;
t._path = p;
throw filesystem_error("Illegal byte sequence for unicode character.", t, std::make_error_code(std::errc::illegal_byte_sequence));
}
#endif
switch (fmt) {
#ifndef GHC_OS_WINDOWS
case path::auto_format:
Expand Down Expand Up @@ -4658,10 +4710,20 @@ class directory_iterator::impl
do {
if (FindNextFileW(_dirHandle, &_findData)) {
_current = _base;
_current.append_name(detail::toUtf8(_findData.cFileName).c_str());
try {
_current.append_name(detail::toUtf8(_findData.cFileName).c_str());
}
catch(filesystem_error& fe) {
ec = fe.code();
return;
}
copyToDirEntry(ec);
}
else {
auto err = ::GetLastError();
if(err != ERROR_NO_MORE_FILES) {
_ec = ec = std::error_code(err, std::system_category());
}
FindClose(_dirHandle);
_dirHandle = INVALID_HANDLE_VALUE;
_current = filesystem::path();
Expand Down
14 changes: 12 additions & 2 deletions test/filesystem_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,18 +320,28 @@ TEST_CASE("fs::detail::fromUtf8", "[filesystem][fs.detail.utf8]")
CHECK(fs::detail::toUtf8(std::wstring(L"foobar")) == "foobar");
CHECK(fs::detail::toUtf8(std::wstring(L"föobar")).length() == 7);
CHECK(fs::detail::toUtf8(std::wstring(L"föobar")) == u8"föobar");


#ifdef GHC_RAISE_UNICODE_ERRORS
CHECK_THROWS_AS(fs::detail::fromUtf8<std::u16string>(std::string("\xed\xa0\x80")), fs::filesystem_error);
CHECK_THROWS_AS(fs::detail::fromUtf8<std::u16string>(std::string("\xc3")), fs::filesystem_error);
#else
CHECK(std::u16string(2,0xfffd) == fs::detail::fromUtf8<std::u16string>(std::string("\xed\xa0\x80")));
CHECK(std::u16string(1,0xfffd) == fs::detail::fromUtf8<std::u16string>(std::string("\xc3")));
#endif
}

// Exercises detail::toUtf8 and detail::appendUTF8 for both valid input and
// illegal code points. With GHC_RAISE_UNICODE_ERRORS defined, illegal input
// must throw fs::filesystem_error; otherwise it must be replaced by the UTF-8
// encoding of U+FFFD ("\xEF\xBF\xBD").
TEST_CASE("fs::detail::toUtf8", "[filesystem][fs.detail.utf8]")
{
    // Single output buffer shared by both configurations. It is declared once
    // here so the non-throwing branch does not redeclare it in the same scope.
    std::string t;
    // Valid UTF-16 input, including a code point outside the BMP (U+1D11E).
    CHECK(std::string("\xc3\xa4/\xe2\x82\xac\xf0\x9d\x84\x9e") == fs::detail::toUtf8(std::u16string(u"\u00E4/\u20AC\U0001D11E")));
#ifdef GHC_RAISE_UNICODE_ERRORS
    // A lone surrogate (0xd800) and an out-of-range code point (0x200000) must throw.
    CHECK_THROWS_AS(fs::detail::toUtf8(std::u16string(1, 0xd800)), fs::filesystem_error);
    CHECK_THROWS_AS(fs::detail::appendUTF8(t, 0x200000), fs::filesystem_error);
#else
    // The same illegal inputs are replaced by U+FFFD instead of throwing.
    CHECK(std::string("\xEF\xBF\xBD") == fs::detail::toUtf8(std::u16string(1, 0xd800)));
    fs::detail::appendUTF8(t, 0x200000);
    CHECK(std::string("\xEF\xBF\xBD") == t);
#endif
}
#endif

Expand Down

0 comments on commit 98dad7f

Please sign in to comment.