Skip to content

Commit

Permalink
Add support for JSON-compatible string escapes (#485)
Browse files Browse the repository at this point in the history
For completeness I've implemented escaping for characters outside the
basic multilingual plane, but it doesn't get used (as there's no
EscapeAsAsciiJson emitter option implemented).
  • Loading branch information
Ortham authored Jul 14, 2020
1 parent 370acee commit c82d312
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 18 deletions.
1 change: 1 addition & 0 deletions include/yaml-cpp/emittermanip.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ enum EMITTER_MANIP {
// output character set
EmitNonAscii,
EscapeNonAscii,
EscapeAsJson,

// string manipulators
// Auto, // duplicate
Expand Down
23 changes: 19 additions & 4 deletions src/emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -686,14 +686,27 @@ void Emitter::StartedScalar() { m_pState->StartedScalar(); }
// *******************************************************************************************
// overloads of Write

// Translates the emitter's output-charset manipulator into the escaping
// style passed to the string/char writers.
//
// EscapeNonAscii maps to StringEscaping::NonAscii, EscapeAsJson maps to
// StringEscaping::JSON, and every other manipulator maps to
// StringEscaping::None.
StringEscaping::value GetStringEscapingStyle(const EMITTER_MANIP emitterManip) {
  switch (emitterManip) {
    case EscapeNonAscii:
      return StringEscaping::NonAscii;
    case EscapeAsJson:
      return StringEscaping::JSON;
    default:
      // Any manipulator that is not an escaping charset requests no
      // additional escaping.
      return StringEscaping::None;
  }
}

Emitter& Emitter::Write(const std::string& str) {
if (!good())
return *this;

const bool escapeNonAscii = m_pState->GetOutputCharset() == EscapeNonAscii;
StringEscaping::value stringEscaping = GetStringEscapingStyle(m_pState->GetOutputCharset());

const StringFormat::value strFormat =
Utils::ComputeStringFormat(str, m_pState->GetStringFormat(),
m_pState->CurGroupFlowType(), escapeNonAscii);
m_pState->CurGroupFlowType(), stringEscaping == StringEscaping::NonAscii);

if (strFormat == StringFormat::Literal)
m_pState->SetMapKeyFormat(YAML::LongKey, FmtScope::Local);
Expand All @@ -708,7 +721,7 @@ Emitter& Emitter::Write(const std::string& str) {
Utils::WriteSingleQuotedString(m_stream, str);
break;
case StringFormat::DoubleQuoted:
Utils::WriteDoubleQuotedString(m_stream, str, escapeNonAscii);
Utils::WriteDoubleQuotedString(m_stream, str, stringEscaping);
break;
case StringFormat::Literal:
Utils::WriteLiteralString(m_stream, str,
Expand Down Expand Up @@ -814,8 +827,10 @@ Emitter& Emitter::Write(char ch) {
if (!good())
return *this;



PrepareNode(EmitterNodeType::Scalar);
Utils::WriteChar(m_stream, ch);
Utils::WriteChar(m_stream, ch, GetStringEscapingStyle(m_pState->GetOutputCharset()));
StartedScalar();

return *this;
Expand Down
1 change: 1 addition & 0 deletions src/emitterstate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ bool EmitterState::SetOutputCharset(EMITTER_MANIP value,
switch (value) {
case EmitNonAscii:
case EscapeNonAscii:
case EscapeAsJson:
_Set(m_charset, value, scope);
return true;
default:
Expand Down
43 changes: 32 additions & 11 deletions src/emitterutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,20 +218,34 @@ bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
});
}

void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint) {
// Encodes a supplementary-plane code point as a UTF-16 surrogate pair.
//
// codePoint is expected to lie in [0x10000, 0x10FFFF]; the result for values
// outside that range is not a valid surrogate pair.
//
// Returns {lead surrogate, trail surrogate}.
std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
  // Folding the 0x10000 bias into the lead base lets the lead surrogate be
  // computed directly from the top bits of the code point.
  const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);

  // The lead must be formed by addition: leadOffset (0xD7C0) already has bits
  // set in the range occupied by (codePoint >> 10), so a bitwise OR would
  // lose information and yield a value below 0xD800.
  const uint32_t lead = leadOffset + static_cast<uint32_t>(codePoint >> 10);
  // 0xDC00 has its low ten bits clear, so OR-ing in the low ten bits is exact.
  const uint32_t trail = 0xDC00 | static_cast<uint32_t>(codePoint & 0x3FF);

  return {
      static_cast<uint16_t>(lead),
      static_cast<uint16_t>(trail),
  };
}

void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
static const char hexDigits[] = "0123456789abcdef";

out << "\\";
int digits = 8;
if (codePoint < 0xFF) {
if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
out << "x";
digits = 2;
} else if (codePoint < 0xFFFF) {
out << "u";
digits = 4;
} else {
} else if (stringEscapingStyle != StringEscaping::JSON) {
out << "U";
digits = 8;
} else {
auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
return;
}

// Write digits into the escape sequence
Expand Down Expand Up @@ -303,7 +317,7 @@ bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
}

bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
bool escapeNonAscii) {
StringEscaping::value stringEscaping) {
out << "\"";
int codePoint;
for (std::string::const_iterator i = str.begin();
Expand All @@ -327,16 +341,19 @@ bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
case '\b':
out << "\\b";
break;
case '\f':
out << "\\f";
break;
default:
if (codePoint < 0x20 ||
(codePoint >= 0x80 &&
codePoint <= 0xA0)) { // Control characters and non-breaking space
WriteDoubleQuoteEscapeSequence(out, codePoint);
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
// escaped (YAML 1.2, sec. 5.2)
WriteDoubleQuoteEscapeSequence(out, codePoint);
} else if (escapeNonAscii && codePoint > 0x7E) {
WriteDoubleQuoteEscapeSequence(out, codePoint);
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else {
WriteCodePoint(out, codePoint);
}
Expand All @@ -362,7 +379,7 @@ bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
return true;
}

bool WriteChar(ostream_wrapper& out, char ch) {
bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
out << ch;
} else if (ch == '\"') {
Expand All @@ -373,13 +390,17 @@ bool WriteChar(ostream_wrapper& out, char ch) {
out << R"("\n")";
} else if (ch == '\b') {
out << R"("\b")";
} else if (ch == '\r') {
out << R"("\r")";
} else if (ch == '\f') {
out << R"("\f")";
} else if (ch == '\\') {
out << R"("\\")";
} else if (0x20 <= ch && ch <= 0x7e) {
out << "\"" << ch << "\"";
} else {
out << "\"";
WriteDoubleQuoteEscapeSequence(out, ch);
WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
out << "\"";
}
return true;
Expand Down Expand Up @@ -469,7 +490,7 @@ bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,

bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
false);
StringEscaping::None);
return true;
}
} // namespace Utils
Expand Down
9 changes: 7 additions & 2 deletions src/emitterutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ struct StringFormat {
enum value { Plain, SingleQuoted, DoubleQuoted, Literal };
};

// Escaping style applied when writing double-quoted strings and characters.
struct StringEscaping {
  enum value {
    None,      // no escaping beyond what the writers always apply
    NonAscii,  // additionally escape code points above 0x7E
    JSON       // emit escape sequences in JSON-compatible form
  };
};

namespace Utils {
StringFormat::value ComputeStringFormat(const std::string& str,
EMITTER_MANIP strFormat,
Expand All @@ -32,10 +36,11 @@ StringFormat::value ComputeStringFormat(const std::string& str,

bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str);
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
bool escapeNonAscii);
StringEscaping::value stringEscaping);
bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
std::size_t indent);
bool WriteChar(ostream_wrapper& out, char ch);
bool WriteChar(ostream_wrapper& out, char ch,
StringEscaping::value stringEscapingStyle);
bool WriteComment(ostream_wrapper& out, const std::string& str,
std::size_t postCommentIndent);
bool WriteAlias(ostream_wrapper& out, const std::string& str);
Expand Down
38 changes: 37 additions & 1 deletion test/integration/emitter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,43 @@ TEST_F(EmitterTest, Unicode) {

TEST_F(EmitterTest, DoubleQuotedUnicode) {
out << DoubleQuoted << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
}

// Checks the output of a double-quoted string when the output charset is
// EscapeAsJson.
TEST_F(EmitterTest, EscapedJsonString) {
out.SetStringFormat(DoubleQuoted);
out.SetOutputCharset(EscapeAsJson);
// Input: a double quote, a backslash, each control character from 0x01 to
// 0x1F, a dollar sign, and three UTF-8 encoded non-ASCII characters
// (two-byte, three-byte and four-byte sequences), separated by spaces.
out << "\" \\ "
"\x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0A \x0B \x0C \x0D \x0E \x0F "
"\x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A \x1B \x1C \x1D \x1E \x1F "
"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";

// Expected: control characters are written as four-hex-digit 'u' escapes,
// except backspace, tab, newline, form feed and carriage return, which use
// their short escapes; the non-ASCII characters are passed through as the
// same UTF-8 bytes.
ExpectEmit(R"("\" \\ \u0001 \u0002 \u0003 \u0004 \u0005 \u0006 \u0007 \b \t )"
R"(\n \u000b \f \r \u000e \u000f \u0010 \u0011 \u0012 \u0013 )"
R"(\u0014 \u0015 \u0016 \u0017 \u0018 \u0019 \u001a \u001b )"
R"(\u001c \u001d \u001e \u001f )"
"$ \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
}

// Emits NUL, form feed and carriage return as individual chars in a sequence
// with no output charset configured, and checks the escaped output.
TEST_F(EmitterTest, EscapedCharacters) {
  const char specialChars[] = {'\x00', '\x0C', '\x0D'};

  out << BeginSeq;
  for (const char ch : specialChars) {
    out << ch;
  }
  out << EndSeq;

  // NUL is expected as a two-hex-digit 'x' escape; form feed and carriage
  // return are expected as their short escapes.
  ExpectEmit("- \"\\x00\"\n- \"\\f\"\n- \"\\r\"");
}

// Emits NUL, form feed and carriage return as individual chars in a sequence
// with the output charset set to EscapeAsJson, and checks the escaped output.
TEST_F(EmitterTest, CharactersEscapedAsJson) {
  const char specialChars[] = {'\x00', '\x0C', '\x0D'};

  out.SetOutputCharset(EscapeAsJson);
  out << BeginSeq;
  for (const char ch : specialChars) {
    out << ch;
  }
  out << EndSeq;

  // NUL is expected as a four-hex-digit 'u' escape; form feed and carriage
  // return are expected as their short escapes.
  ExpectEmit("- \"\\u0000\"\n- \"\\f\"\n- \"\\r\"");
}

TEST_F(EmitterTest, DoubleQuotedString) {
Expand Down

0 comments on commit c82d312

Please sign in to comment.