From d91229c1e1df1ea706b9c93d79ffcf3dd863455b Mon Sep 17 00:00:00 2001 From: Dave Cridland Date: Wed, 6 Nov 2024 17:23:25 +0000 Subject: [PATCH] Use buffer_ptr inside predicate skips While I can't reproduce this bug in the test suite, I suspect that in some cases the value decoding will run over the end of the buffer. This is because the buffer is not NUL terminated, and the code was originally expecting to perform this in-situ on the original buffer, which was both NUL-terminated and within known delimiters. --- rapidxml.hpp | 36 ++++++++++++++++-------------------- test/low-level-parse.cpp | 12 ++++++------ test/parse-simple.cpp | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 26 deletions(-) diff --git a/rapidxml.hpp b/rapidxml.hpp index 67e17c3..365e929 100644 --- a/rapidxml.hpp +++ b/rapidxml.hpp @@ -1679,18 +1679,17 @@ namespace rapidxml template view_type decode_data_value_low(view_type const & v) { - auto * init = v.data(); - auto * first = init; + buffer_ptr first{v}; if (Flags & parse_normalize_whitespace) { skip(first); } else { skip(first); } - if (*first == '<') return v; - auto buf = this->allocate_span(v); + if (!*first) return v; + auto buf = this->allocate_string(v); auto * start = buf.data(); - auto * tmp = start; - auto * end = (Flags & parse_normalize_whitespace) ? + buffer_ptr tmp{buf}; + auto end = (Flags & parse_normalize_whitespace) ? skip_and_expand_character_refs(tmp) : skip_and_expand_character_refs(tmp); // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after > @@ -1715,14 +1714,13 @@ namespace rapidxml template view_type decode_attr_value_low(view_type const & v) { - Ch const * init = v.data(); - Ch const * first = init; + buffer_ptr first{v}; skip,0>(first); - if (*first == Q) return v; - auto buf = this->allocate_span(v); - Ch * start = buf.data(); - Ch * tmp = start; - Ch * end = skip_and_expand_character_refs,attribute_value_pure_pred,0>(tmp); + if (!*first || *first == Q) return v; + auto buf = this->allocate_string(v); + const Ch * start = buf.data(); + buffer_ptr tmp{buf}; + const Ch * end = skip_and_expand_character_refs,attribute_value_pure_pred,0>(tmp); return {start, end}; } @@ -1922,8 +1920,8 @@ namespace rapidxml // Skip characters until predicate evaluates to true while doing the following: // - replacing XML character entity references with proper characters (' & " < > &#...;) // - condensing whitespace sequences to single space character - template - static Ch *skip_and_expand_character_refs(Ch *&text) + template + static const Ch *skip_and_expand_character_refs(Chp text) { // If entity translation, whitespace condense and whitespace trimming is disabled, use plain skip if (Flags & parse_no_entity_translation && @@ -1931,15 +1929,15 @@ namespace rapidxml !(Flags & parse_trim_whitespace)) { skip(text); - return text; + return &*text; } // Use simple skip until first modification is detected skip(text); // Use translation skip - Ch *src = text; - Ch *dest = src; + Chp src = text; + Ch * dest = const_cast(&*src); while (StopPred::test(*src)) { // If entity translation is enabled @@ -2063,9 +2061,7 @@ namespace rapidxml } // Return new end - text = src; return dest; - } /////////////////////////////////////////////////////////////////////// diff --git a/test/low-level-parse.cpp b/test/low-level-parse.cpp index a5f3719..df2fc75 100644 --- a/test/low-level-parse.cpp +++ b/test/low-level-parse.cpp @@ -35,31 +35,31 @@ TEST(PredicateBuffer, Skip) { TEST(Predicates, SkipAndExpand) { std::string test_data{"&hello;<"}; char * start = const_cast(test_data.c_str()); - start = rapidxml::xml_document<>::skip_and_expand_character_refs< + auto end = rapidxml::xml_document<>::skip_and_expand_character_refs< rapidxml::xml_document<>::text_pred, rapidxml::xml_document<>::text_pure_with_ws_pred, rapidxml::parse_no_entity_translation>(start); - EXPECT_EQ(*start, '<'); + EXPECT_EQ(*end, '<'); } TEST(Predicates, SkipAndExpandShort) { std::string test_data{"&hello;"}; char * start = const_cast(test_data.c_str()); - start = rapidxml::xml_document<>::skip_and_expand_character_refs< + auto end = rapidxml::xml_document<>::skip_and_expand_character_refs< rapidxml::xml_document<>::text_pred, rapidxml::xml_document<>::text_pure_with_ws_pred, rapidxml::parse_no_entity_translation>(start); - EXPECT_EQ(*start, '\0'); + EXPECT_EQ(*end, '\0'); } TEST(Predicates, SkipAndExpandShorter) { std::string test_data{"&hell"}; char * start = const_cast(test_data.c_str()); - start = rapidxml::xml_document<>::skip_and_expand_character_refs< + auto end = rapidxml::xml_document<>::skip_and_expand_character_refs< rapidxml::xml_document<>::text_pred, rapidxml::xml_document<>::text_pure_with_ws_pred, rapidxml::parse_no_entity_translation>(start); - EXPECT_EQ(*start, '\0'); + EXPECT_EQ(*end, '\0'); } TEST(ParseFns, ParseBom) { diff --git a/test/parse-simple.cpp b/test/parse-simple.cpp index 6e36e56..dd2c64b 100644 --- a/test/parse-simple.cpp +++ b/test/parse-simple.cpp @@ -264,3 +264,42 @@ TEST(ParseOptions, OpenOnlyFastest) { subdoc.validate(); } } + +TEST(Parser_Emoji, Single) { + std::string foo{"'"}; + rapidxml::xml_document<> doc; + doc.parse(foo); + EXPECT_EQ("'", doc.first_node()->value()); +} + +TEST(Parser_Emoji, SingleUni) { + std::string foo{"Ӓ"}; + rapidxml::xml_document<> doc; + doc.parse(foo); + EXPECT_EQ("\xD3\x92", doc.first_node()->value()); +} + +TEST(Parser_Emoji, SingleEmoji) { + std::string foo{"😀"}; + rapidxml::xml_document<> doc; + doc.parse(foo); + EXPECT_EQ("\xF0\x9F\x98\x80", doc.first_node()->value()); + EXPECT_EQ(4, doc.first_node()->value().size()); +} + +TEST(Parser_Emoji, SingleEmojiReuse) { + std::string bar("Sir I bear a rhyme excelling in mystic verse and magic spelling 😀"); + rapidxml::xml_document<> doc; + rapidxml::xml_document<> parent_doc; + parent_doc.parse(""); + doc.parse(bar, &parent_doc); + EXPECT_EQ("Sir I bear a rhyme excelling in mystic verse and magic spelling \xF0\x9F\x98\x80", doc.first_node()->value()); + auto doc_a = doc.first_node()->document(); + doc.first_node()->value(doc_a->allocate_string("Sausages are the loneliest fruit, and are but one of the strange things I have witnessed in my long and interesting life.")); + EXPECT_EQ("Sausages are the loneliest fruit, and are but one of the strange things I have witnessed in my long and interesting life.", doc.first_node()->value()); + bar = "😀"; + doc.parse(bar, &parent_doc); + EXPECT_EQ("\xF0\x9F\x98\x80", doc.first_node()->value()); + EXPECT_EQ(4, doc.first_node()->value().size()); +} +