From c82d3129ddbb531d261e995d43999f9cd041a3bd Mon Sep 17 00:00:00 2001
From: Oliver Hamlet
Date: Tue, 14 Jul 2020 03:16:34 +0100
Subject: [PATCH] Add support for JSON-compatible string escapes (#485)

For completeness I've implemented escaping for characters outside the
basic multilingual plane, but it doesn't get used (as there's no
EscapeAsAsciiJson emitter option implemented).
---
 include/yaml-cpp/emittermanip.h   |  1 +
 src/emitter.cpp                   | 23 ++++++++++++++---
 src/emitterstate.cpp              |  1 +
 src/emitterutils.cpp              | 43 +++++++++++++++++++++++--------
 src/emitterutils.h                |  9 +++++--
 test/integration/emitter_test.cpp | 38 ++++++++++++++++++++++++++-
 6 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/include/yaml-cpp/emittermanip.h b/include/yaml-cpp/emittermanip.h
index 010d8609f..976d14950 100644
--- a/include/yaml-cpp/emittermanip.h
+++ b/include/yaml-cpp/emittermanip.h
@@ -19,6 +19,7 @@ enum EMITTER_MANIP {
   // output character set
   EmitNonAscii,
   EscapeNonAscii,
+  EscapeAsJson,
 
   // string manipulators
   // Auto, // duplicate
diff --git a/src/emitter.cpp b/src/emitter.cpp
index cf093e00f..327b1ce32 100644
--- a/src/emitter.cpp
+++ b/src/emitter.cpp
@@ -686,14 +686,27 @@ void Emitter::StartedScalar() { m_pState->StartedScalar(); }
 // *******************************************************************************************
 // overloads of Write
 
+StringEscaping::value GetStringEscapingStyle(const EMITTER_MANIP emitterManip) {
+  switch (emitterManip) {
+    case EscapeNonAscii:
+      return StringEscaping::NonAscii;
+    case EscapeAsJson:
+      return StringEscaping::JSON;
+    default:
+      return StringEscaping::None;
+      break;
+  }
+}
+
 Emitter& Emitter::Write(const std::string& str) {
   if (!good())
     return *this;
 
-  const bool escapeNonAscii = m_pState->GetOutputCharset() == EscapeNonAscii;
+  StringEscaping::value stringEscaping = GetStringEscapingStyle(m_pState->GetOutputCharset());
+
   const StringFormat::value strFormat =
       Utils::ComputeStringFormat(str, m_pState->GetStringFormat(),
-                                 m_pState->CurGroupFlowType(), escapeNonAscii);
+                                 m_pState->CurGroupFlowType(), stringEscaping == StringEscaping::NonAscii);
 
   if (strFormat == StringFormat::Literal)
     m_pState->SetMapKeyFormat(YAML::LongKey, FmtScope::Local);
@@ -708,7 +721,7 @@ Emitter& Emitter::Write(const std::string& str) {
       Utils::WriteSingleQuotedString(m_stream, str);
       break;
     case StringFormat::DoubleQuoted:
-      Utils::WriteDoubleQuotedString(m_stream, str, escapeNonAscii);
+      Utils::WriteDoubleQuotedString(m_stream, str, stringEscaping);
       break;
     case StringFormat::Literal:
       Utils::WriteLiteralString(m_stream, str,
@@ -814,8 +827,10 @@ Emitter& Emitter::Write(char ch) {
   if (!good())
     return *this;
 
+
+
   PrepareNode(EmitterNodeType::Scalar);
-  Utils::WriteChar(m_stream, ch);
+  Utils::WriteChar(m_stream, ch, GetStringEscapingStyle(m_pState->GetOutputCharset()));
   StartedScalar();
 
   return *this;
diff --git a/src/emitterstate.cpp b/src/emitterstate.cpp
index 40497f795..70f937d8a 100644
--- a/src/emitterstate.cpp
+++ b/src/emitterstate.cpp
@@ -231,6 +231,7 @@ bool EmitterState::SetOutputCharset(EMITTER_MANIP value,
   switch (value) {
     case EmitNonAscii:
     case EscapeNonAscii:
+    case EscapeAsJson:
       _Set(m_charset, value, scope);
       return true;
     default:
diff --git a/src/emitterutils.cpp b/src/emitterutils.cpp
index 0410f931e..0113c454c 100644
--- a/src/emitterutils.cpp
+++ b/src/emitterutils.cpp
@@ -218,20 +218,34 @@ bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
   });
 }
 
-void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint) {
+// Splits a code point above U+FFFF into its UTF-16 (lead, trail) surrogates.
+std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
+  const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);
+  return {leadOffset + (codePoint >> 10), 0xDC00 | (codePoint & 0x3FF)};
+}
+
+void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
   static const char hexDigits[] = "0123456789abcdef";
 
+  if (stringEscapingStyle == StringEscaping::JSON && codePoint > 0xFFFF) {
+    // JSON has no \U escape; emit two \u escapes before the backslash below.
+    const auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
+    WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
+    WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
+    return;
+  }
+
   out << "\\";
   int digits = 8;
-  if (codePoint < 0xFF) {
+  if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
     out << "x";
     digits = 2;
-  } else if (codePoint < 0xFFFF) {
+  } else if (codePoint < 0xFFFF || stringEscapingStyle == StringEscaping::JSON) {
     out << "u";
     digits = 4;
   } else {
     out << "U";
     digits = 8;
   }
 
   // Write digits into the escape sequence
@@ -303,7 +317,7 @@ bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
 }
 
 bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
-                             bool escapeNonAscii) {
+                             StringEscaping::value stringEscaping) {
   out << "\"";
   int codePoint;
   for (std::string::const_iterator i = str.begin();
@@ -327,16 +341,19 @@ bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
       case '\b':
         out << "\\b";
         break;
+      case '\f':
+        out << "\\f";
+        break;
       default:
         if (codePoint < 0x20 ||
             (codePoint >= 0x80 &&
              codePoint <= 0xA0)) {  // Control characters and non-breaking space
-          WriteDoubleQuoteEscapeSequence(out, codePoint);
+          WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
         } else if (codePoint == 0xFEFF) {  // Byte order marks (ZWNS) should be
                                            // escaped (YAML 1.2, sec. 5.2)
-          WriteDoubleQuoteEscapeSequence(out, codePoint);
-        } else if (escapeNonAscii && codePoint > 0x7E) {
-          WriteDoubleQuoteEscapeSequence(out, codePoint);
+          WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
+        } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
+          WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
         } else {
           WriteCodePoint(out, codePoint);
         }
@@ -362,7 +379,7 @@ bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
   return true;
 }
 
-bool WriteChar(ostream_wrapper& out, char ch) {
+bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
   if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
     out << ch;
   } else if (ch == '\"') {
@@ -373,13 +390,17 @@ bool WriteChar(ostream_wrapper& out, char ch) {
     out << R"("\n")";
   } else if (ch == '\b') {
     out << R"("\b")";
+  } else if (ch == '\r') {
+    out << R"("\r")";
+  } else if (ch == '\f') {
+    out << R"("\f")";
   } else if (ch == '\\') {
     out << R"("\\")";
   } else if (0x20 <= ch && ch <= 0x7e) {
     out << "\"" << ch << "\"";
   } else {
     out << "\"";
-    WriteDoubleQuoteEscapeSequence(out, ch);
+    WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
     out << "\"";
   }
   return true;
@@ -469,7 +490,7 @@ bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
 
 bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
   WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
-                          false);
+                          StringEscaping::None);
   return true;
 }
 }  // namespace Utils
diff --git a/src/emitterutils.h b/src/emitterutils.h
index 6cc731914..3a7d59825 100644
--- a/src/emitterutils.h
+++ b/src/emitterutils.h
@@ -24,6 +24,10 @@ struct StringFormat {
   enum value { Plain, SingleQuoted, DoubleQuoted, Literal };
 };
 
+struct StringEscaping {
+  enum value { None, NonAscii, JSON };
+};
+
 namespace Utils {
 StringFormat::value ComputeStringFormat(const std::string& str,
                                         EMITTER_MANIP strFormat,
@@ -32,10 +36,11 @@ StringFormat::value ComputeStringFormat(const std::string& str,
 
 bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str);
 bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
-                             bool escapeNonAscii);
+                             StringEscaping::value stringEscaping);
 bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
                         std::size_t indent);
-bool WriteChar(ostream_wrapper& out, char ch);
+bool WriteChar(ostream_wrapper& out, char ch,
+               StringEscaping::value stringEscapingStyle);
 bool WriteComment(ostream_wrapper& out, const std::string& str,
                   std::size_t postCommentIndent);
 bool WriteAlias(ostream_wrapper& out, const std::string& str);
diff --git a/test/integration/emitter_test.cpp b/test/integration/emitter_test.cpp
index 4cae36fef..285d0dee1 100644
--- a/test/integration/emitter_test.cpp
+++ b/test/integration/emitter_test.cpp
@@ -813,7 +813,43 @@ TEST_F(EmitterTest, Unicode) {
 
 TEST_F(EmitterTest, DoubleQuotedUnicode) {
   out << DoubleQuoted << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
-  ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
+  ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
+}
+
+TEST_F(EmitterTest, EscapedJsonString) {
+  out.SetStringFormat(DoubleQuoted);
+  out.SetOutputCharset(EscapeAsJson);
+  out << "\" \\ "
+    "\x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0A \x0B \x0C \x0D \x0E \x0F "
+    "\x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A \x1B \x1C \x1D \x1E \x1F "
+    "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
+
+  ExpectEmit(R"("\" \\ \u0001 \u0002 \u0003 \u0004 \u0005 \u0006 \u0007 \b \t )"
+    R"(\n \u000b \f \r \u000e \u000f \u0010 \u0011 \u0012 \u0013 )"
+    R"(\u0014 \u0015 \u0016 \u0017 \u0018 \u0019 \u001a \u001b )"
+    R"(\u001c \u001d \u001e \u001f )"
+    "$ \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
+}
+
+TEST_F(EmitterTest, EscapedCharacters) {
+  out << BeginSeq
+    << '\x00'
+    << '\x0C'
+    << '\x0D'
+    << EndSeq;
+
+  ExpectEmit("- \"\\x00\"\n- \"\\f\"\n- \"\\r\"");
+}
+
+TEST_F(EmitterTest, CharactersEscapedAsJson) {
+  out.SetOutputCharset(EscapeAsJson);
+  out << BeginSeq
+    << '\x00'
+    << '\x0C'
+    << '\x0D'
+    << EndSeq;
+
+  ExpectEmit("- \"\\u0000\"\n- \"\\f\"\n- \"\\r\"");
 }
 
 TEST_F(EmitterTest, DoubleQuotedString) {