fix: sanitize references

cppalliance · Feb 29, 2024 · 1de3dff · 1de3dff
1 parent 541cf5b
commit 1de3dff
Show file tree

Hide file tree

Showing 3 changed files with 195 additions and 25 deletions.
diff --git a/src/lib/AST/ParseJavadoc.cpp b/src/lib/AST/ParseJavadoc.cpp
@@ -6,6 +6,7 @@
 //
 // Copyright (c) 2023 Vinnie Falco ([email protected])
 // Copyright (c) 2023 Krystian Stasiowski ([email protected])
+// Copyright (c) 2024 Alan de Freitas ([email protected])
 //
 // Official repository: https://github.com/cppalliance/mrdocs
 //
@@ -497,7 +498,7 @@ parseHTMLTag(HTMLStartTagComment const* C)
         }) : it_;
     if (tagEndIt == end_)
     {
-        return Unexpected(Error(fmt::format("warning: HTML <{}> tag not followed by end tag", res.tag)));
+        return Unexpected(formatError("warning: HTML <{}> tag not followed by end tag", res.tag));
     }
 
     // Check if end tag matches start tag
@@ -660,6 +661,94 @@ convertDirection(ParamCommandComment::PassDirection kind)
     }
 }
 
+/** Extract the leading identifier from a string.
+ *
+ * Returns the longest prefix of `str` made only of ASCII letters,
+ * digits and underscores. No check is made that the first character
+ * is a non-digit, so a leading run of digits is accepted as well.
+ *
+ * If that prefix is exactly `operator`, the result is extended over
+ * the run of operator punctuation characters that immediately
+ * follows it (e.g. `operator==` or `operator()`). The punctuation
+ * set also contains `.`, `,`, `;` and `:`, so such characters placed
+ * directly after an operator name become part of the result.
+ * Whitespace is in neither character set and always ends the result.
+ *
+ * @param str The text to scan.
+ * @return A view of the leading identifier, referring to the same
+ * characters as `str`; empty if `str` is empty or does not start
+ * with an identifier character.
+ */
+std::string_view
+parseIdentifier(std::string_view str)
+{
+    static constexpr auto idChars = // characters allowed in a plain identifier
+        "abcdefghijklmnopqrstuvwxyz"
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        "0123456789"
+        "_";
+    static constexpr auto operatorChars = // punctuation accepted after the word `operator`
+        "~!%^&*()-+=|[]{};:,.<>?/";
+    if (str.empty())
+    {
+        return {};
+    }

+    std::size_t p = str.find_first_not_of(idChars); // index one past the leading identifier run
+    if (p == std::string_view::npos)
+    {
+        return str; // the whole string is identifier characters
+    }

+    if (str.substr(0, p) == "operator") // operator names continue with punctuation
+    {
+        p = str.find_first_not_of(operatorChars, p); // skip the punctuation run after `operator`
+        if (p == std::string_view::npos)
+        {
+            return str; // punctuation runs to the end of the string
+        }
+    }

+    return str.substr(0, p); // empty when p == 0, i.e. no identifier at the start
+}
+
+/** Extract the leading qualified identifier from a string.
+ *
+ * Accepts an optional leading `::` followed by identifiers
+ * (as recognized by parseIdentifier) separated by `::`, and stops
+ * at the first character that does not continue that pattern.
+ * A `::` left dangling at the end of the match is dropped from the
+ * result; as a consequence, an input that matches only `::` yields
+ * an empty view.
+ *
+ * @param str The text to scan.
+ * @return A view of the leading qualified identifier, referring to
+ * the same characters as `str`; empty if none is found.
+ */
+std::string_view
+parseQualifiedIdentifier(std::string_view str)
+{
+    auto str0 = str; // keep the full input; `str` is consumed from the front below
+    std::size_t off = 0; // number of characters of str0 matched so far
+    if (str.starts_with("::"))
+    {
+        off += 2;
+        str.remove_prefix(2);
+    }

+    bool atIdentifier = true; // true: expect an identifier next; false: expect `::`
+    while (!str.empty())
+    {
+        if (atIdentifier)
+        {
+            auto idStr = parseIdentifier(str);
+            if (!idStr.empty())
+            {
+                off += idStr.size();
+                str = str.substr(idStr.size());
+                atIdentifier = false;
+            }
+            else
+            {
+                break; // no identifier here: the match ends
+            }
+        }
+        else
+        {
+            // At delimiter
+            if (str.starts_with("::"))
+            {
+                off += 2;
+                str = str.substr(2);
+                atIdentifier = true;
+            }
+            else
+            {
+                break; // anything other than `::` ends the match
+            }
+        }
+    }
+    std::string_view result = str0.substr(0, off); // everything matched, as a view into the input
+    if (result.ends_with("::")) // a trailing separator is not followed by a name: drop it
+    {
+        result = result.substr(0, result.size() - 2);
+    }
+    return result;
+}
+
 void
 JavadocVisitor::
 visitInlineCommandComment(
@@ -672,9 +761,6 @@ visitInlineCommandComment(
     // VFALCO I'd like to know when this happens
     MRDOCS_ASSERT(cmd != nullptr);
 
-    // KRYSTIAN FIXME: the text for a copydoc/ref command
-    // should not include illegal characters
-    // (e.g. periods that occur after the symbol name)
     switch(unsigned ID = cmd->getID())
     {
     // Emphasis
@@ -699,23 +785,75 @@ visitInlineCommandComment(
     {
         if(! goodArgCount(1, *C))
             return;
+
         // the referenced symbol will be resolved during
         // the finalization step once all symbols are extracted
+        std::string const &s = C->getArgText(0).str();
+        bool const copyingFunctionDoc = s.find('(') != std::string::npos;
+        std::string ref = s;
+        if (copyingFunctionDoc)
+        {
+            // Clang parses the copydoc command breaking
+            // before the complete overload information. For instance,
+            // `@copydoc operator()(unsigned char) const` will create
+            // a node with the text `operator()(unsigned` and another
+            // with `char) const`. We need to merge these nodes.
+            std::size_t open = std::ranges::count(s, '(');
+            std::size_t close = std::ranges::count(s, ')');
+            while (open != close)
+            {
+                ++it_;
+                if (it_ == end_)
+                {
+                    break;
+                }
+                auto const* c = *it_;
+                if (c->getCommentKind() == Comment::TextCommentKind)
+                {
+                    ref += static_cast<TextComment const*>(c)->getText();
+                }
+                else
+                {
+                    break;
+                }
+                open = std::ranges::count(ref, '(');
+                close = std::ranges::count(ref, ')');
+            }
+        }
         emplaceText<doc::Copied>(
             C->hasTrailingNewline(),
-            C->getArgText(0).str(),
+            ref,
             convertCopydoc(ID));
         return;
     }
     case CommandTraits::KCI_ref:
     {
         if(! goodArgCount(1, *C))
             return;
-        // the referenced symbol will be resolved during
-        // the finalization step once all symbols are extracted
-        emplaceText<doc::Reference>(
-            C->hasTrailingNewline(),
-            C->getArgText(0).str());
+        // The parsed reference often includes characters
+        // that are not valid in identifiers, so we need to
+        // clean it up.
+        // Find the first character that is not a valid C++
+        // identifier character, and truncate the string there.
+        // This potentially creates two text nodes.
+        auto const s = C->getArgText(0).str();
+        std::string_view ref = parseQualifiedIdentifier(s);
+        bool const hasExtraText = ref.size() != s.size();
+        if (!ref.empty())
+        {
+            // the referenced symbol will be resolved during
+            // the finalization step once all symbols are extracted
+            emplaceText<doc::Reference>(
+                C->hasTrailingNewline() && !hasExtraText,
+                std::string(ref));
+        }
+        // Emplace the rest of the string as doc::Text
+        if(hasExtraText)
+        {
+            emplaceText<doc::Text>(
+                C->hasTrailingNewline(),
+                s.substr(ref.size()));
+        }
         return;
     }
 

diff --git a/src/lib/Gen/xml/XMLWriter.cpp b/src/lib/Gen/xml/XMLWriter.cpp
@@ -622,49 +622,55 @@ writeNode(
     switch(node.kind)
     {
     case doc::Kind::text:
-        writeText(static_cast<doc::Text const&>(node));
+        writeText(dynamic_cast<doc::Text const&>(node));
         break;
     case doc::Kind::styled:
-        writeStyledText(static_cast<doc::Styled const&>(node));
+        writeStyledText(dynamic_cast<doc::Styled const&>(node));
         break;
     case doc::Kind::heading:
-        writeHeading(static_cast<doc::Heading const&>(node));
+        writeHeading(dynamic_cast<doc::Heading const&>(node));
         break;
     case doc::Kind::paragraph:
-        writeParagraph(static_cast<doc::Paragraph const&>(node));
+        writeParagraph(dynamic_cast<doc::Paragraph const&>(node));
         break;
     case doc::Kind::link:
-        writeLink(static_cast<doc::Link const&>(node));
+        writeLink(dynamic_cast<doc::Link const&>(node));
         break;
     case doc::Kind::list_item:
-        writeListItem(static_cast<doc::ListItem const&>(node));
+        writeListItem(dynamic_cast<doc::ListItem const&>(node));
         break;
     case doc::Kind::brief:
-        writeBrief(static_cast<doc::Brief const&>(node));
+        writeBrief(dynamic_cast<doc::Brief const&>(node));
         break;
     case doc::Kind::admonition:
-        writeAdmonition(static_cast<doc::Admonition const&>(node));
+        writeAdmonition(dynamic_cast<doc::Admonition const&>(node));
         break;
     case doc::Kind::code:
-        writeCode(static_cast<doc::Code const&>(node));
+        writeCode(dynamic_cast<doc::Code const&>(node));
         break;
     case doc::Kind::param:
-        writeJParam(static_cast<doc::Param const&>(node));
+        writeJParam(dynamic_cast<doc::Param const&>(node));
         break;
     case doc::Kind::tparam:
-        writeTParam(static_cast<doc::TParam const&>(node));
+        writeTParam(dynamic_cast<doc::TParam const&>(node));
         break;
     case doc::Kind::returns:
-        writeReturns(static_cast<doc::Returns const&>(node));
+        writeReturns(dynamic_cast<doc::Returns const&>(node));
         break;
     case doc::Kind::reference:
-        writeReference(static_cast<doc::Reference const&>(node));
+        writeReference(dynamic_cast<doc::Reference const&>(node));
         break;
     case doc::Kind::copied:
-        writeCopied(static_cast<doc::Copied const&>(node));
+        writeCopied(dynamic_cast<doc::Copied const&>(node));
         break;
     case doc::Kind::throws:
-        writeThrows(static_cast<doc::Throws const&>(node));
+        writeThrows(dynamic_cast<doc::Throws const&>(node));
+        break;
+    case doc::Kind::details:
+        writeDetails(dynamic_cast<doc::Details const&>(node));
+        break;
+    case doc::Kind::see:
+        writeSee(dynamic_cast<doc::See const&>(node));
         break;
     default:
         // unknown kind
@@ -776,6 +782,30 @@ writeParagraph(
     tags_.close("para");
 }
 
+void // Writes a doc::Details node as a "details" element enclosing the node's children.
+XMLWriter::
+writeDetails(
+    doc::Details const& para, // node whose `children` are written inside the element
+    llvm::StringRef tag) // value passed for the "class" attribute
+{
+    tags_.open("details", {
+        { "class", tag, ! tag.empty() }}); // NOTE(review): third field presumably enables the attribute only for a non-empty tag - confirm against tags_.open
+    writeNodes(para.children); // child nodes are written between the open and close calls
+    tags_.close("details");
+}
+
+void // Writes a doc::See node as a "see" element enclosing the node's children.
+XMLWriter::
+writeSee(
+    doc::See const& para, // node whose `children` are written inside the element
+    llvm::StringRef tag) // value passed for the "class" attribute
+{
+    tags_.open("see", {
+        { "class", tag, ! tag.empty() }}); // NOTE(review): third field presumably enables the attribute only for a non-empty tag - confirm against tags_.open
+    writeNodes(para.children); // child nodes are written between the open and close calls
+    tags_.close("see");
+}
+
 void
 XMLWriter::
 writeAdmonition(

diff --git a/src/lib/Gen/xml/XMLWriter.hpp b/src/lib/Gen/xml/XMLWriter.hpp
@@ -97,6 +97,8 @@ class XMLWriter
     void writeReference(doc::Reference const& node);
     void writeCopied(doc::Copied const& node);
     void writeThrows(doc::Throws const& node);
+    void writeDetails(doc::Details const& node, llvm::StringRef tag = "");
+    void writeSee(doc::See const& node, llvm::StringRef tag = "");
 };
 
 } // xml