From 1de3dff0fc681a23a9cf9285e4f3cbfcbb57ecc6 Mon Sep 17 00:00:00 2001 From: alandefreitas Date: Thu, 22 Feb 2024 17:18:24 -0300 Subject: [PATCH] fix: sanitize references --- src/lib/AST/ParseJavadoc.cpp | 158 +++++++++++++++++++++++++++++++--- src/lib/Gen/xml/XMLWriter.cpp | 60 +++++++++---- src/lib/Gen/xml/XMLWriter.hpp | 2 + 3 files changed, 195 insertions(+), 25 deletions(-) diff --git a/src/lib/AST/ParseJavadoc.cpp b/src/lib/AST/ParseJavadoc.cpp index d10bb94cf..4393729bd 100644 --- a/src/lib/AST/ParseJavadoc.cpp +++ b/src/lib/AST/ParseJavadoc.cpp @@ -6,6 +6,7 @@ // // Copyright (c) 2023 Vinnie Falco (vinnie.falco@gmail.com) // Copyright (c) 2023 Krystian Stasiowski (sdkrystian@gmail.com) +// Copyright (c) 2024 Alan de Freitas (alandefreitas@gmail.com) // // Official repository: https://github.com/cppalliance/mrdocs // @@ -497,7 +498,7 @@ parseHTMLTag(HTMLStartTagComment const* C) }) : it_; if (tagEndIt == end_) { - return Unexpected(Error(fmt::format("warning: HTML <{}> tag not followed by end tag", res.tag))); + return Unexpected(formatError("warning: HTML <{}> tag not followed by end tag", res.tag)); } // Check if end tag matches start tag @@ -660,6 +661,94 @@ convertDirection(ParamCommandComment::PassDirection kind) } } +/** Parse first chars of string that represent an identifier + */ +std::string_view +parseIdentifier(std::string_view str) +{ + static constexpr auto idChars = + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789" + "_"; + static constexpr auto operatorChars = + "~!%^&*()-+=|[]{};:,.<>?/"; + if (str.empty()) + { + return {}; + } + + std::size_t p = str.find_first_not_of(idChars); + if (p == std::string_view::npos) + { + return str; + } + + if (str.substr(0, p) == "operator") + { + p = str.find_first_not_of(operatorChars, p); + if (p == std::string_view::npos) + { + return str; + } + } + + return str.substr(0, p); +} + +/** Parse first chars of string that represent a qualified identifier + */ +std::string_view 
+parseQualifiedIdentifier(std::string_view str) +{ + auto str0 = str; + std::size_t off = 0; + if (str.starts_with("::")) + { + off += 2; + str.remove_prefix(2); + } + + bool atIdentifier = true; + while (!str.empty()) + { + if (atIdentifier) + { + auto idStr = parseIdentifier(str); + if (!idStr.empty()) + { + off += idStr.size(); + str = str.substr(idStr.size()); + atIdentifier = false; + } + else + { + break; + } + } + else + { + // At delimiter + if (str.starts_with("::")) + { + off += 2; + str = str.substr(2); + atIdentifier = true; + } + else + { + break; + } + } + } + std::string_view result = str0.substr(0, off); + if (result.ends_with("::")) + { + result = result.substr(0, result.size() - 2); + } + return result; +} + void JavadocVisitor:: visitInlineCommandComment( @@ -672,9 +761,6 @@ visitInlineCommandComment( // VFALCO I'd like to know when this happens MRDOCS_ASSERT(cmd != nullptr); - // KRYSTIAN FIXME: the text for a copydoc/ref command - // should not include illegal characters - // (e.g. periods that occur after the symbol name) switch(unsigned ID = cmd->getID()) { // Emphasis @@ -699,11 +785,44 @@ visitInlineCommandComment( { if(! goodArgCount(1, *C)) return; + // the referenced symbol will be resolved during // the finalization step once all symbols are extracted + std::string const &s = C->getArgText(0).str(); + bool const copyingFunctionDoc = s.find('(') != std::string::npos; + std::string ref = s; + if (copyingFunctionDoc) + { + // Clang parses the copydoc command breaking + // before the complete overload information. For instance, + // `@copydoc operator()(unsigned char) const` will create + // a node with the text `operator()(unsigned` and another + // with `char) const`. We need to merge these nodes. 
+ std::size_t open = std::ranges::count(s, '('); + std::size_t close = std::ranges::count(s, ')'); + while (open != close) + { + ++it_; + if (it_ == end_) + { + break; + } + auto const* c = *it_; + if (c->getCommentKind() == Comment::TextCommentKind) + { + ref += static_cast(c)->getText(); + } + else + { + break; + } + open = std::ranges::count(ref, '('); + close = std::ranges::count(ref, ')'); + } + } emplaceText( C->hasTrailingNewline(), - C->getArgText(0).str(), + ref, convertCopydoc(ID)); return; } @@ -711,11 +830,30 @@ visitInlineCommandComment( { if(! goodArgCount(1, *C)) return; - // the referenced symbol will be resolved during - // the finalization step once all symbols are extracted - emplaceText( - C->hasTrailingNewline(), - C->getArgText(0).str()); + // The parsed reference often includes characters + // that are not valid in identifiers, so we need to + // clean it up. + // Find the first character that is not a valid C++ + // identifier character, and truncate the string there. + // This potentially creates two text nodes. 
+ auto const s = C->getArgText(0).str(); + std::string_view ref = parseQualifiedIdentifier(s); + bool const hasExtraText = ref.size() != s.size(); + if (!ref.empty()) + { + // the referenced symbol will be resolved during + // the finalization step once all symbols are extracted + emplaceText( + C->hasTrailingNewline() && !hasExtraText, + std::string(ref)); + } + // Emplace the rest of the string as doc::Text + if(hasExtraText) + { + emplaceText( + C->hasTrailingNewline(), + s.substr(ref.size())); + } return; } diff --git a/src/lib/Gen/xml/XMLWriter.cpp b/src/lib/Gen/xml/XMLWriter.cpp index 76827b609..0a89e9196 100644 --- a/src/lib/Gen/xml/XMLWriter.cpp +++ b/src/lib/Gen/xml/XMLWriter.cpp @@ -622,49 +622,55 @@ writeNode( switch(node.kind) { case doc::Kind::text: - writeText(static_cast(node)); + writeText(dynamic_cast(node)); break; case doc::Kind::styled: - writeStyledText(static_cast(node)); + writeStyledText(dynamic_cast(node)); break; case doc::Kind::heading: - writeHeading(static_cast(node)); + writeHeading(dynamic_cast(node)); break; case doc::Kind::paragraph: - writeParagraph(static_cast(node)); + writeParagraph(dynamic_cast(node)); break; case doc::Kind::link: - writeLink(static_cast(node)); + writeLink(dynamic_cast(node)); break; case doc::Kind::list_item: - writeListItem(static_cast(node)); + writeListItem(dynamic_cast(node)); break; case doc::Kind::brief: - writeBrief(static_cast(node)); + writeBrief(dynamic_cast(node)); break; case doc::Kind::admonition: - writeAdmonition(static_cast(node)); + writeAdmonition(dynamic_cast(node)); break; case doc::Kind::code: - writeCode(static_cast(node)); + writeCode(dynamic_cast(node)); break; case doc::Kind::param: - writeJParam(static_cast(node)); + writeJParam(dynamic_cast(node)); break; case doc::Kind::tparam: - writeTParam(static_cast(node)); + writeTParam(dynamic_cast(node)); break; case doc::Kind::returns: - writeReturns(static_cast(node)); + writeReturns(dynamic_cast(node)); break; case doc::Kind::reference: - 
writeReference(static_cast(node)); + writeReference(dynamic_cast(node)); break; case doc::Kind::copied: - writeCopied(static_cast(node)); + writeCopied(dynamic_cast(node)); break; case doc::Kind::throws: - writeThrows(static_cast(node)); + writeThrows(dynamic_cast(node)); + break; + case doc::Kind::details: + writeDetails(dynamic_cast(node)); + break; + case doc::Kind::see: + writeSee(dynamic_cast(node)); break; default: // unknown kind @@ -776,6 +782,30 @@ writeParagraph( tags_.close("para"); } +void +XMLWriter:: +writeDetails( + doc::Details const& para, + llvm::StringRef tag) +{ + tags_.open("details", { + { "class", tag, ! tag.empty() }}); + writeNodes(para.children); + tags_.close("details"); +} + +void +XMLWriter:: +writeSee( + doc::See const& para, + llvm::StringRef tag) +{ + tags_.open("see", { + { "class", tag, ! tag.empty() }}); + writeNodes(para.children); + tags_.close("see"); +} + void XMLWriter:: writeAdmonition( diff --git a/src/lib/Gen/xml/XMLWriter.hpp b/src/lib/Gen/xml/XMLWriter.hpp index 61a320413..bc043984c 100644 --- a/src/lib/Gen/xml/XMLWriter.hpp +++ b/src/lib/Gen/xml/XMLWriter.hpp @@ -97,6 +97,8 @@ class XMLWriter void writeReference(doc::Reference const& node); void writeCopied(doc::Copied const& node); void writeThrows(doc::Throws const& node); + void writeDetails(doc::Details const& node, llvm::StringRef tag = ""); + void writeSee(doc::See const& node, llvm::StringRef tag = ""); }; } // xml