From 1671e39943cedd001427076df92adafe8a91d1f8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:15:16 -0500 Subject: [PATCH 01/30] Move constants into scope for test-prefix-tree.cpp. --- tests/test-prefix-tree.cpp | 63 ++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 27c7988..66d8f8a 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -10,22 +10,11 @@ using log_surgeon::finite_automata::PrefixTree; using id_t = PrefixTree::id_t; using position_t = PrefixTree::position_t; -constexpr auto cRootId{PrefixTree::cRootId}; -constexpr id_t cInvalidNodeId{100}; -constexpr position_t cInsertPos1{4}; -constexpr position_t cInsertPos2{7}; -constexpr position_t cInsertPos3{9}; -constexpr position_t cMaxPos{std::numeric_limits::max()}; -constexpr position_t cNegativePos1{-1}; -constexpr position_t cNegativePos2{-100}; -constexpr position_t cSetPos1{10}; -constexpr position_t cSetPos2{12}; -constexpr position_t cSetPos3{15}; -constexpr position_t cSetPos4{20}; -constexpr position_t cTreeSize1{4}; -constexpr position_t cTreeSize2{8}; - TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + constexpr auto cRootId{PrefixTree::cRootId}; + constexpr position_t cInitialPos1{4}; + constexpr position_t cSetPos1{10}; + SECTION("Newly constructed tree works correctly") { PrefixTree const tree; @@ -34,16 +23,24 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Inserting nodes into the prefix tree works correctly") { + constexpr position_t cInitialPos2{7}; + constexpr position_t cInitialPos3{9}; + constexpr position_t cMaxPos{std::numeric_limits::max()}; + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + constexpr position_t cTreeSize1{4}; + constexpr position_t cTreeSize2{8}; + PrefixTree tree; // Test basic insertions - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos2)}; - auto const node_id_3{tree.insert(node_id_2, cInsertPos3)}; - REQUIRE(std::vector{cInsertPos1} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{cInsertPos2, cInsertPos1} + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInitialPos3)}; + REQUIRE(std::vector{cInitialPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_2)); - REQUIRE(std::vector{cInsertPos3, cInsertPos2, cInsertPos1} + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_3)); REQUIRE(cTreeSize1 == tree.size()); @@ -53,12 +50,12 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { // Test insertion with negative position values auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; - auto const node_id_6{tree.insert(node_id_5, cInsertPos1)}; + auto const node_id_6{tree.insert(node_id_5, cInitialPos1)}; auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cInitialPos1, cNegativePos1} == tree.get_reversed_positions(node_id_6)); - REQUIRE(std::vector{cNegativePos2, cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} == tree.get_reversed_positions(node_id_7)); REQUIRE(cTreeSize2 == tree.size()); } @@ -67,7 +64,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { PrefixTree tree; REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); - tree.insert(cRootId, cInsertPos1); + tree.insert(cRootId, cInitialPos1); REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); REQUIRE_THROWS_AS( @@ -77,13 +74,17 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for a valid index works correctly") { + constexpr position_t cSetPos2{12}; + constexpr position_t cSetPos3{15}; + constexpr position_t cSetPos4{20}; + PrefixTree tree; // Test that you can set the root node for sanity, although this value is not used tree.set(cRootId, cSetPos1); // Test updates to different nodes - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos1)}; + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos1)}; tree.set(node_id_1, cSetPos1); tree.set(node_id_2, cSetPos2); REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); @@ -105,13 +106,15 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for an invalid index throws correctly") { + constexpr id_t cInvalidNodeId{100}; + PrefixTree tree; // Test setting position before any insertions - REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos4), std::out_of_range); + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range); // Test setting position just beyond valid range - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos4), std::out_of_range); + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range); } } From 748dfc5566f6ab72df78bccbee99b1c9b503e2db Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:21:05 -0500 Subject: [PATCH 02/30] Rename to handler_init and return handler. --- tests/test-register-handler.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 2371fc9..e8102e2 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -12,17 +12,19 @@ using position_t = log_surgeon::finite_automata::PrefixTree::position_t; namespace { /** - * @param handler The register handler that will contain the new registers. - * @param num_registers The number of registers to initialize. + * @param num_registers The number of registers managed by the handler. + * @return The newly initialized register handler. */ -auto registers_init(RegisterHandler& handler, size_t num_registers) -> void; +[[nodiscard]] auto handler_init(size_t num_registers) -> RegisterHandler; -auto registers_init(RegisterHandler& handler, size_t const num_registers) -> void { +auto handler_init(size_t const num_registers) -> RegisterHandler { constexpr position_t cDefaultPos{0}; + RegisterHandler handler; for (size_t i{0}; i < num_registers; ++i) { handler.add_register(i, cDefaultPos); } + return handler; } } // namespace @@ -32,13 +34,12 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { constexpr size_t cRegId1{0}; constexpr size_t cRegId2{1}; - RegisterHandler handler; - SECTION("Initial state is empty") { - REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); + RegisterHandler empty_handler{handler_init(0)}; + REQUIRE_THROWS_AS(empty_handler.get_reversed_positions(cRegId1), std::out_of_range); } - registers_init(handler, cNumRegisters); + RegisterHandler handler{handler_init(cNumRegisters)}; SECTION("Set register position correctly") { handler.set_register(cRegId1, cInitialPos1); From 8abf35a15dfef1d26adfdfbc4895a20cb535a323 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:22:39 -0500 Subject: [PATCH 03/30] Add docstring for get_parent_id_unsafe(). --- src/log_surgeon/finite_automata/PrefixTree.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index ab88d80..e2de78a 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -65,6 +65,11 @@ class PrefixTree { [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } + /** + * Gets the parent ID without checking if it's `std::nullopt`. + * NOTE: This method should only be used if the caller has checked the node is not the root. + * @return The ID of the parent node in the prefix tree. + */ [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) return m_parent_id.value(); From 99b5b08608a53a199bcba3f57aedd64b7ba0bd0c Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 6 Dec 2024 15:49:58 -0500 Subject: [PATCH 04/30] feat: Add `PrefixTree` and `RegisterHandler` to support TDFA simulation. (#56) Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- CMakeLists.txt | 3 + .../finite_automata/PrefixTree.cpp | 20 +++ .../finite_automata/PrefixTree.hpp | 91 +++++++++++++ .../finite_automata/RegisterHandler.hpp | 52 ++++++++ tests/CMakeLists.txt | 5 +- tests/test-prefix-tree.cpp | 120 ++++++++++++++++++ tests/test-register-handler.cpp | 98 ++++++++++++++ 7 files changed, 388 insertions(+), 1 deletion(-) create mode 100644 src/log_surgeon/finite_automata/PrefixTree.cpp create mode 100644 src/log_surgeon/finite_automata/PrefixTree.hpp create mode 100644 src/log_surgeon/finite_automata/RegisterHandler.hpp create mode 100644 tests/test-prefix-tree.cpp create mode 100644 tests/test-register-handler.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e76ecb8..117cde5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,12 +93,15 @@ set(SOURCE_FILES src/log_surgeon/SchemaParser.hpp src/log_surgeon/Token.cpp src/log_surgeon/Token.hpp + src/log_surgeon/finite_automata/PrefixTree.cpp + src/log_surgeon/finite_automata/PrefixTree.hpp src/log_surgeon/finite_automata/RegexAST.hpp src/log_surgeon/finite_automata/RegexDFA.hpp src/log_surgeon/finite_automata/RegexDFA.tpp src/log_surgeon/finite_automata/RegexNFA.hpp src/log_surgeon/finite_automata/RegexNFAState.hpp src/log_surgeon/finite_automata/RegexNFAStateType.hpp + src/log_surgeon/finite_automata/RegisterHandler.hpp src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp new file mode 100644 index 0000000..4a65234 --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -0,0 +1,20 @@ +#include "PrefixTree.hpp" + +#include +#include + +namespace log_surgeon::finite_automata { +auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector { + if (m_nodes.size() <= node_id) { + throw std::out_of_range("Prefix tree index out of range."); + } + + std::vector reversed_positions; + auto current_node{m_nodes[node_id]}; + while (false == current_node.is_root()) { + reversed_positions.push_back(current_node.get_position()); + current_node = m_nodes[current_node.get_parent_id_unsafe()]; + } + return reversed_positions; +} +} // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp new file mode 100644 index 0000000..e2de78a --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -0,0 +1,91 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP + +#include +#include +#include +#include +#include + +namespace log_surgeon::finite_automata { +/** + * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree + * stores a single position in the lexed string. Each path from the root to an index corresponds to + * a sequence of positions for an individual tag: + * - Positive position node: Indicates the tag was matched at the position. + * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, + * it indicates the tag was never matched. If the negative tag is along a path containing positive + * nodes, it functions as a placeholder. This can be useful for nested capture groups, to maintain + * a one-to-one mapping between the contained capture group and the enclosing capture group. + */ +class PrefixTree { +public: + using id_t = uint32_t; + using position_t = int32_t; + + static constexpr id_t cRootId{0}; + + PrefixTree() : m_nodes{{std::nullopt, -1}} {} + + /** + * @param parent_node_id Index of the inserted node's parent in the prefix tree. + * @param position The position in the lexed string. + * @return The index of the newly inserted node in the tree. + * @throw std::out_of_range if the parent's index is out of range. + */ + [[maybe_unused]] auto insert(id_t const parent_node_id, position_t const position) -> id_t { + if (m_nodes.size() <= parent_node_id) { + throw std::out_of_range("Predecessor index out of range."); + } + + m_nodes.emplace_back(parent_node_id, position); + return m_nodes.size() - 1; + } + + auto set(id_t const node_id, position_t const position) -> void { + m_nodes.at(node_id).set_position(position); + } + + [[nodiscard]] auto size() const -> size_t { return m_nodes.size(); } + + /** + * @param node_id The index of the node. + * @return A vector containing positions in order from the given index up to but not including + * the root node. + * @throw std::out_of_range if the index is out of range. + */ + [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector; + +private: + class Node { + public: + Node(std::optional const parent_id, position_t const position) + : m_parent_id{parent_id}, + m_position{position} {} + + [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } + + /** + * Gets the parent ID without checking if it's `std::nullopt`. + * NOTE: This method should only be used if the caller has checked the node is not the root. + * @return The ID of the parent node in the prefix tree. + */ + [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + return m_parent_id.value(); + } + + auto set_position(position_t const position) -> void { m_position = position; } + + [[nodiscard]] auto get_position() const -> position_t { return m_position; } + + private: + std::optional m_parent_id; + position_t m_position; + }; + + std::vector m_nodes; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp new file mode 100644 index 0000000..d61240e --- /dev/null +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -0,0 +1,52 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP + +#include +#include + +#include + +namespace log_surgeon::finite_automata { +/** + * The register handler maintains a prefix tree that is sufficient to represent all registers. + * The register handler also contains a vector of registers, and performs the set, copy, and append + * operations for these registers. + * + * NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the + * DFA's responsibility to set the register values when needed. + */ +class RegisterHandler { +public: + auto add_register( + PrefixTree::id_t const prefix_tree_parent_node_id, + PrefixTree::position_t const position + ) -> void { + auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; + m_registers.emplace_back(prefix_tree_node_id); + } + + auto set_register(size_t const reg_id, PrefixTree::position_t const position) -> void { + m_prefix_tree.set(m_registers.at(reg_id), position); + } + + auto copy_register(size_t const dest_reg_id, size_t const source_reg_id) -> void { + m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); + } + + auto append_position(size_t const reg_id, PrefixTree::position_t const position) -> void { + auto const node_id{m_registers.at(reg_id)}; + m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position); + } + + [[nodiscard]] auto get_reversed_positions(size_t const reg_id + ) const -> std::vector { + return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); + } + +private: + PrefixTree m_prefix_tree; + std::vector m_registers; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d150252..ec974e6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,10 +2,13 @@ set( SOURCES_LOG_SURGEON ../src/log_surgeon/FileReader.cpp ../src/log_surgeon/FileReader.hpp + ../src/log_surgeon/finite_automata/PrefixTree.cpp + ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp ../src/log_surgeon/finite_automata/RegexNFA.hpp ../src/log_surgeon/finite_automata/RegexNFAState.hpp ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp + ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp @@ -21,7 +24,7 @@ set( ../src/log_surgeon/Token.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-tag.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp new file mode 100644 index 0000000..66d8f8a --- /dev/null +++ b/tests/test-prefix-tree.cpp @@ -0,0 +1,120 @@ +#include +#include +#include + +#include + +#include + +using log_surgeon::finite_automata::PrefixTree; +using id_t = PrefixTree::id_t; +using position_t = PrefixTree::position_t; + +TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + constexpr auto cRootId{PrefixTree::cRootId}; + constexpr position_t cInitialPos1{4}; + constexpr position_t cSetPos1{10}; + + SECTION("Newly constructed tree works correctly") { + PrefixTree const tree; + + // A newly constructed tree should return no positions as the root node is ignored + REQUIRE(tree.get_reversed_positions(cRootId).empty()); + } + + SECTION("Inserting nodes into the prefix tree works correctly") { + constexpr position_t cInitialPos2{7}; + constexpr position_t cInitialPos3{9}; + constexpr position_t cMaxPos{std::numeric_limits::max()}; + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + constexpr position_t cTreeSize1{4}; + constexpr position_t cTreeSize2{8}; + + PrefixTree tree; + + // Test basic insertions + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInitialPos3)}; + REQUIRE(std::vector{cInitialPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInitialPos2, cInitialPos1} + == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} + == tree.get_reversed_positions(node_id_3)); + REQUIRE(cTreeSize1 == tree.size()); + + // Test insertion with large position values + auto const node_id_4{tree.insert(cRootId, cMaxPos)}; + REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]); + + // Test insertion with negative position values + auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; + auto const node_id_6{tree.insert(node_id_5, cInitialPos1)}; + auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; + REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{cInitialPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_6)); + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_7)); + REQUIRE(cTreeSize2 == tree.size()); + } + + SECTION("Invalid index access throws correctly") { + PrefixTree tree; + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + tree.insert(cRootId, cInitialPos1); + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + REQUIRE_THROWS_AS( + tree.get_reversed_positions(std::numeric_limits::max()), + std::out_of_range + ); + } + + SECTION("Set position for a valid index works correctly") { + constexpr position_t cSetPos2{12}; + constexpr position_t cSetPos3{15}; + constexpr position_t cSetPos4{20}; + + PrefixTree tree; + // Test that you can set the root node for sanity, although this value is not used + tree.set(cRootId, cSetPos1); + + // Test updates to different nodes + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos1)}; + tree.set(node_id_1, cSetPos1); + tree.set(node_id_2, cSetPos2); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos2, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test multiple updates to the same node + tree.set(node_id_2, cSetPos3); + tree.set(node_id_2, cSetPos4); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test that updates don't affect unrelated paths + auto const node_id_3{tree.insert(cRootId, cSetPos2)}; + tree.set(node_id_3, cSetPos3); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + } + + SECTION("Set position for an invalid index throws correctly") { + constexpr id_t cInvalidNodeId{100}; + + PrefixTree tree; + + // Test setting position before any insertions + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range); + + // Test setting position just beyond valid range + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range); + } +} diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp new file mode 100644 index 0000000..e8102e2 --- /dev/null +++ b/tests/test-register-handler.cpp @@ -0,0 +1,98 @@ +#include +#include +#include + +#include + +#include +#include + +using log_surgeon::finite_automata::RegisterHandler; +using position_t = log_surgeon::finite_automata::PrefixTree::position_t; + +namespace { +/** + * @param num_registers The number of registers managed by the handler. + * @return The newly initialized register handler. + */ +[[nodiscard]] auto handler_init(size_t num_registers) -> RegisterHandler; + +auto handler_init(size_t const num_registers) -> RegisterHandler { + constexpr position_t cDefaultPos{0}; + + RegisterHandler handler; + for (size_t i{0}; i < num_registers; ++i) { + handler.add_register(i, cDefaultPos); + } + return handler; +} +} // namespace + +TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { + constexpr position_t cInitialPos1{5}; + constexpr size_t cNumRegisters{5}; + constexpr size_t cRegId1{0}; + constexpr size_t cRegId2{1}; + + SECTION("Initial state is empty") { + RegisterHandler empty_handler{handler_init(0)}; + REQUIRE_THROWS_AS(empty_handler.get_reversed_positions(cRegId1), std::out_of_range); + } + + RegisterHandler handler{handler_init(cNumRegisters)}; + + SECTION("Set register position correctly") { + handler.set_register(cRegId1, cInitialPos1); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Register relationships are maintained") { + constexpr position_t cInitialPos2{10}; + constexpr position_t cInitialPos3{15}; + constexpr size_t cRegId3{2}; + + handler.set_register(cRegId1, cInitialPos1); + handler.set_register(cRegId2, cInitialPos2); + handler.set_register(cRegId3, cInitialPos3); + + auto positions{handler.get_reversed_positions(cRegId3)}; + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} + == handler.get_reversed_positions(cRegId3)); + } + + SECTION("Copy register index correctly") { + handler.set_register(cRegId1, cInitialPos1); + handler.copy_register(cRegId2, cRegId1); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId2)); + } + + SECTION("`append_position` appends position correctly") { + constexpr position_t cAppendPos{10}; + + handler.set_register(cRegId1, cInitialPos1); + handler.append_position(cRegId1, cAppendPos); + REQUIRE(std::vector{cAppendPos, cInitialPos1} + == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Throws out of range correctly") { + constexpr size_t cInvalidRegId{10}; + + REQUIRE_THROWS_AS(handler.set_register(cInvalidRegId, cInitialPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cInvalidRegId, cRegId2), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cRegId1, cInvalidRegId), std::out_of_range); + REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, cInitialPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.get_reversed_positions(cInvalidRegId), std::out_of_range); + } + + SECTION("Handles negative position values correctly") { + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + + handler.set_register(cRegId1, cNegativePos1); + handler.append_position(cRegId1, cInitialPos1); + handler.append_position(cRegId1, cNegativePos2); + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} + == handler.get_reversed_positions(cRegId1)); + } +} From a12a3607d6d4040f50acaa03c31b4a200e6f1a29 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 18:47:56 -0500 Subject: [PATCH 05/30] Fix comment length. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index a916b26..7277b6e 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -47,9 +47,8 @@ class RegexDFAState { private: std::vector m_matching_variable_ids; RegexDFAState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == - // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) - // in that case. + // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, + // so we use an empty class (`std::tuple<>`) in that case. std::conditional_t> m_tree_transitions; }; From 244d122ee630b2b347c6e95fbb2d16d8ef230eeb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 18:52:46 -0500 Subject: [PATCH 06/30] Initialize byte transitions. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 7277b6e..90d83e5 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -22,6 +22,10 @@ class RegexDFAState { public: using Tree = UnicodeIntervalTree*>; + RegexDFAState() { + std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); + } + auto add_matching_variable_id(uint32_t const variable_id) -> void { m_matching_variable_ids.push_back(variable_id); } From 176391b490f51d940aed2bb216446ed7d5be0958 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 19:17:54 -0500 Subject: [PATCH 07/30] Use const* in place of unique_ptr reference; Update docstrings. --- src/log_surgeon/finite_automata/RegexDFA.hpp | 26 +++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 63e0a80..3e8ad14 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -14,10 +14,9 @@ template class RegexDFA { public: /** - * Creates a new DFA state based on a set of NFA states and adds it to - * m_states - * @param nfa_state_set - * @return DFAStateType* + * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. + * @param nfa_state_set The set of NFA states represented by this DFA state. + * @return A pointer to the new DFA state. */ template auto new_state(std::set const& nfa_state_set) -> DFAStateType*; @@ -25,16 +24,14 @@ class RegexDFA { auto get_root() const -> DFAStateType const* { return m_states.at(0).get(); } /** - * Compares this dfa with dfa_in to determine the set of schema types in - * this dfa that are reachable by any type in dfa_in. A type is considered - * reachable if there is at least one string for which: (1) this dfa returns - * a set of types containing the type, and (2) dfa_in returns any non-empty - * set of types. - * @param dfa_in - * @return The set of schema types reachable by dfa_in + * Compares this dfa with `dfa_in` to determine the set of schema types in this dfa that are + * reachable by any type in `dfa_in`. A type is considered reachable if there is at least one + * string for which: (1) this dfa returns a set of types containing the type, and (2) `dfa_in` + * returns any non-empty set of types. + * @param dfa_in The dfa with which to take the intersect. + * @return The set of schema types reachable by `dfa_in`. */ - [[nodiscard]] auto get_intersect(std::unique_ptr const& dfa_in - ) const -> std::set; + [[nodiscard]] auto get_intersect(RegexDFA const* dfa_in) const -> std::set; private: std::vector> m_states; @@ -55,8 +52,7 @@ auto RegexDFA::new_state(std::set const& nfa_state_ } template -auto RegexDFA::get_intersect(std::unique_ptr const& dfa_in -) const -> std::set { +auto RegexDFA::get_intersect(RegexDFA const* dfa_in) const -> std::set { std::set schema_types; std::set> unvisited_pairs; std::set> visited_pairs; From 012f61f1c78eb7afcf888c7e7018c7d2503ca5e4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 19:20:17 -0500 Subject: [PATCH 08/30] Update intersect test to compile. --- examples/intersect-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index a5d0e43..19d696b 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -42,7 +42,7 @@ auto get_intersect_for_query( } RegexNFA nfa(std::move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); - auto schema_types = dfa1->get_intersect(dfa2); + auto schema_types = dfa1->get_intersect(dfa2.get()); std::cout << search_string << ":"; for (auto const& schema_type : schema_types) { std::cout << m_id_symbol[schema_type] << ","; From 96a6363b2a4fcd27171e88a50e6a21b8cfd8f11e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 19:23:39 -0500 Subject: [PATCH 09/30] Update next() docstring. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 90d83e5..8c3e179 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -41,10 +41,8 @@ class RegexDFAState { } /** - * Returns the next state the DFA transitions to on input character (byte or - * utf8) - * @param character - * @return RegexDFAState* + * @param character The character (byte or utf8) to transition on. + * @return A pointer to the DFA state reached after transitioning on `character`. */ [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState*; From a4a93b4d47fe17582325cc9e139f3060afd57722 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:24:09 -0500 Subject: [PATCH 10/30] Rename to state_type. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index fb19214..98aedca 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -17,7 +17,7 @@ class RegexDFAState; using RegexDFAByteState = RegexDFAState; using RegexDFAUTF8State = RegexDFAState; -template +template class RegexDFAState { public: using Tree = UnicodeIntervalTree; @@ -49,14 +49,14 @@ class RegexDFAState { private: std::vector m_matching_variable_ids; RegexDFAState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, + // NOTE: We don't need m_tree_transitions for the `state_type == RegexDFAStateType::Byte` case, // so we use an empty class (`std::tuple<>`) in that case. - std::conditional_t> m_tree_transitions; + std::conditional_t> m_tree_transitions; }; -template -auto RegexDFAState::next(uint32_t character) const -> RegexDFAState* { - if constexpr (RegexDFAStateType::Byte == stateType) { +template +auto RegexDFAState::next(uint32_t character) const -> RegexDFAState* { + if constexpr (RegexDFAStateType::Byte == state_type) { return m_bytes_transition[character]; } else { if (character < cSizeOfByte) { From 421c3de5c132a73635bbf0b29fe1fefba4e1b07f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:29:35 -0500 Subject: [PATCH 11/30] Update headers. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 1 + src/log_surgeon/finite_automata/RegexDFAStatePair.hpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 8c3e179..efa8e74 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include diff --git a/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp b/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp index 9672900..208a3e8 100644 --- a/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp @@ -1,10 +1,11 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR +#include #include #include -#include +#include namespace log_surgeon::finite_automata { /** From 1b945a11058df5408997cde53288c0550ede86cb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:33:05 -0500 Subject: [PATCH 12/30] Update Lexer headers. --- src/log_surgeon/Lexer.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index ddb12cf..726ff68 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include From 78c41256d17a9d201cfef112158b63a92243ddad Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:37:37 -0500 Subject: [PATCH 13/30] Add header for conditional_t. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index efa8e74..5e99f2e 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -1,6 +1,7 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE +#include #include #include #include From 33623fa51bcd4dd8791d301c83a2ac787e80a05e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:39:58 -0500 Subject: [PATCH 14/30] Linter. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 5e99f2e..92f5b23 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -1,11 +1,11 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE -#include #include #include #include #include +#include #include #include From 5bbeafce0c95afe89460b2655c84f5feb06e0f3b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:47:03 -0500 Subject: [PATCH 15/30] Linter. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index be2b0ae..e7a166d 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -53,7 +53,8 @@ class RegexDFAState { RegexDFAState* m_bytes_transition[cSizeOfByte]; // NOTE: We don't need m_tree_transitions for the `state_type == RegexDFAStateType::Byte` case, // so we use an empty class (`std::tuple<>`) in that case. - std::conditional_t> m_tree_transitions; + std::conditional_t> + m_tree_transitions; }; template From 0decaf50de260891bddf2aee9c6a2b126d12b04a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:57:17 -0500 Subject: [PATCH 16/30] Change ! to false ==. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 92f5b23..3c0ef4c 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -36,7 +36,9 @@ class RegexDFAState { return m_matching_variable_ids; } - [[nodiscard]] auto is_accepting() const -> bool { return !m_matching_variable_ids.empty(); } + [[nodiscard]] auto is_accepting() const -> bool { + return false == m_matching_variable_ids.empty(); + } auto add_byte_transition(uint8_t const& byte, RegexDFAState* dest_state) -> void { m_bytes_transition[byte] = dest_state; @@ -67,7 +69,7 @@ auto RegexDFAState::next(uint32_t character) const -> RegexDFAState> result = m_tree_transitions.find(Interval(character, character)); assert(result->size() <= 1); - if (!result->empty()) { + if (false == result->empty()) { return result->front().m_value; } return nullptr; From 6e65a3e50a7c2e3b9df128e428409d66a9582ba4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 03:44:02 -0500 Subject: [PATCH 17/30] LALR1Parser to Lalr1Parser. --- CMakeLists.txt | 6 +-- src/log_surgeon/BufferParser.hpp | 4 +- .../{LALR1Parser.cpp => Lalr1Parser.cpp} | 2 +- .../{LALR1Parser.hpp => Lalr1Parser.hpp} | 6 +-- .../{LALR1Parser.tpp => Lalr1Parser.tpp} | 48 +++++++++---------- src/log_surgeon/LogParser.hpp | 6 +-- src/log_surgeon/ReaderParser.hpp | 4 +- src/log_surgeon/SchemaParser.cpp | 2 +- src/log_surgeon/SchemaParser.hpp | 4 +- tests/CMakeLists.txt | 6 +-- 10 files changed, 44 insertions(+), 44 deletions(-) rename src/log_surgeon/{LALR1Parser.cpp => Lalr1Parser.cpp} (93%) rename src/log_surgeon/{LALR1Parser.hpp => Lalr1Parser.hpp} (99%) rename src/log_surgeon/{LALR1Parser.tpp => Lalr1Parser.tpp} (95%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2627928..ceb932d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,9 +68,9 @@ set(SOURCE_FILES src/log_surgeon/Constants.hpp src/log_surgeon/FileReader.cpp src/log_surgeon/FileReader.hpp - src/log_surgeon/LALR1Parser.cpp - src/log_surgeon/LALR1Parser.hpp - src/log_surgeon/LALR1Parser.tpp + src/log_surgeon/Lalr1Parser.cpp + src/log_surgeon/Lalr1Parser.hpp + src/log_surgeon/Lalr1Parser.tpp src/log_surgeon/Lexer.hpp src/log_surgeon/Lexer.tpp src/log_surgeon/LexicalRule.hpp diff --git a/src/log_surgeon/BufferParser.hpp b/src/log_surgeon/BufferParser.hpp index 75eb41a..4a1a8e7 100644 --- a/src/log_surgeon/BufferParser.hpp +++ b/src/log_surgeon/BufferParser.hpp @@ -20,7 +20,7 @@ class BufferParser { /** * Constructs the parser using the given schema file. * @param schema_file_path - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure parsing the schema file or processing the schema * AST. */ @@ -29,7 +29,7 @@ class BufferParser { /** * Constructs the parser using the given schema AST. * @param schema_ast - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure processing the schema AST. */ explicit BufferParser(std::unique_ptr schema_ast); diff --git a/src/log_surgeon/LALR1Parser.cpp b/src/log_surgeon/Lalr1Parser.cpp similarity index 93% rename from src/log_surgeon/LALR1Parser.cpp rename to src/log_surgeon/Lalr1Parser.cpp index d1c4b7c..a7f54a6 100644 --- a/src/log_surgeon/LALR1Parser.cpp +++ b/src/log_surgeon/Lalr1Parser.cpp @@ -1,4 +1,4 @@ -#include "LALR1Parser.hpp" +#include "Lalr1Parser.hpp" namespace log_surgeon { MatchedSymbol NonTerminal::m_all_children[cSizeOfAllChildren]; diff --git a/src/log_surgeon/LALR1Parser.hpp b/src/log_surgeon/Lalr1Parser.hpp similarity index 99% rename from src/log_surgeon/LALR1Parser.hpp rename to src/log_surgeon/Lalr1Parser.hpp index d3ff39a..c13103c 100644 --- a/src/log_surgeon/LALR1Parser.hpp +++ b/src/log_surgeon/Lalr1Parser.hpp @@ -203,9 +203,9 @@ struct ItemSet { }; template -class LALR1Parser : public Parser { +class Lalr1Parser : public Parser { public: - LALR1Parser(); + Lalr1Parser(); /** * Add a lexical rule to m_lexer @@ -407,6 +407,6 @@ class LALR1Parser : public Parser { }; } // namespace log_surgeon -#include "LALR1Parser.tpp" +#include "Lalr1Parser.tpp" #endif // LOG_SURGEON_LALR1_PARSER_HPP diff --git a/src/log_surgeon/LALR1Parser.tpp b/src/log_surgeon/Lalr1Parser.tpp similarity index 95% rename from src/log_surgeon/LALR1Parser.tpp rename to src/log_surgeon/Lalr1Parser.tpp index ac03d3b..6e60d34 100644 --- a/src/log_surgeon/LALR1Parser.tpp +++ b/src/log_surgeon/Lalr1Parser.tpp @@ -54,7 +54,7 @@ namespace { } // namespace template -LALR1Parser::LALR1Parser() { +Lalr1Parser::Lalr1Parser() { m_terminals.insert((uint32_t)SymbolId::TokenEnd); m_terminals.insert((uint32_t)SymbolId::TokenUncaughtString); m_terminals.insert((uint32_t)SymbolId::TokenInt); @@ -66,7 +66,7 @@ LALR1Parser::LALR1Parser() { } template -void LALR1Parser::add_rule( +void Lalr1Parser::add_rule( std::string const& name, std::unique_ptr> rule ) { @@ -75,7 +75,7 @@ void LALR1Parser::add_rule( } template -void LALR1Parser::add_token_group( +void Lalr1Parser::add_token_group( std::string const& name, std::unique_ptr> rule_group ) { @@ -83,7 +83,7 @@ void LALR1Parser::add_token_group( } template -void LALR1Parser::add_token_chain( +void Lalr1Parser::add_token_chain( std::string const& name, std::string const& chain ) { @@ -110,7 +110,7 @@ void LALR1Parser::add_token_chain( } template -auto LALR1Parser::add_production( +auto Lalr1Parser::add_production( std::string const& head, std::vector const& body, SemanticRule semantic_rule @@ -151,7 +151,7 @@ auto LALR1Parser::add_production( } template -void LALR1Parser::generate() { +void Lalr1Parser::generate() { this->m_lexer.generate(); assert(!m_productions.empty()); generate_lr0_kernels(); @@ -161,7 +161,7 @@ void LALR1Parser::generate() { } template -void LALR1Parser::generate_lr0_kernels() { +void Lalr1Parser::generate_lr0_kernels() { Production* root_production_ptr = m_productions[m_root_production_id].get(); Item root_item(root_production_ptr, 0, cNullSymbol); std::unique_ptr item_set0 = std::make_unique(); @@ -191,7 +191,7 @@ void LALR1Parser::generate_lr0_kernels() { } template -auto LALR1Parser::lr_closure_helper( +auto Lalr1Parser::lr_closure_helper( ItemSet* item_set_ptr, Item const* item, uint32_t* next_symbol @@ -211,7 +211,7 @@ auto LALR1Parser::lr_closure_helper( } template -void LALR1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { +void Lalr1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { std::deque q( item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end() @@ -234,7 +234,7 @@ void LALR1Parser::generate_lr0_closure(ItemSet* item } template -auto LALR1Parser::go_to( +auto Lalr1Parser::go_to( ItemSet* from_item_set, uint32_t const& next_symbol ) -> ItemSet* { @@ -267,7 +267,7 @@ auto LALR1Parser::go_to( } template -void LALR1Parser::generate_first_sets() { +void Lalr1Parser::generate_first_sets() { for (uint32_t const& s : m_terminals) { m_firsts.insert(std::pair>(s, {s})); } @@ -299,7 +299,7 @@ void LALR1Parser::generate_first_sets() { } template -void LALR1Parser::generate_lr1_item_sets() { +void Lalr1Parser::generate_lr1_item_sets() { for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) { for (Item const& l0_item : kv.second->m_kernel) { @@ -383,7 +383,7 @@ void LALR1Parser::generate_lr1_item_sets() { } template -void LALR1Parser::generate_lr1_closure(ItemSet* item_set_ptr) { +void Lalr1Parser::generate_lr1_closure(ItemSet* item_set_ptr) { std::deque queue(item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end()); while (!queue.empty()) { Item item = queue.back(); @@ -419,19 +419,19 @@ void LALR1Parser::generate_lr1_closure(ItemSet* item } template -void LALR1Parser::generate_lalr1_parsing_table() { +void Lalr1Parser::generate_lalr1_parsing_table() { generate_lalr1_goto(); generate_lalr1_action(); } template -void LALR1Parser::generate_lalr1_goto() { +void Lalr1Parser::generate_lalr1_goto() { // done already at end of generate_lr1_item_sets()? } // Dragon book page 253 template -void LALR1Parser::generate_lalr1_action() { +void Lalr1Parser::generate_lalr1_action() { for (std::map, std::unique_ptr>::value_type const& kv : m_lr1_item_sets) { ItemSet* item_set_ptr = kv.second.get(); @@ -519,7 +519,7 @@ void LALR1Parser::generate_lalr1_action() { } template -auto LALR1Parser::get_input_after_last_newline( +auto Lalr1Parser::get_input_after_last_newline( std::stack& parse_stack_matches ) -> std::string { std::string error_message_reversed; @@ -558,7 +558,7 @@ auto LALR1Parser::get_input_after_last_newline( } template -auto LALR1Parser::get_input_until_next_newline(Token* error_token +auto Lalr1Parser::get_input_until_next_newline(Token* error_token ) -> std::string { std::string rest_of_line; bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd); @@ -578,7 +578,7 @@ auto LALR1Parser::get_input_until_next_newline(Token } template -auto LALR1Parser::report_error() -> std::string { +auto Lalr1Parser::report_error() -> std::string { assert(m_next_token == std::nullopt); assert(!m_parse_stack_matches.empty()); MatchedSymbol top_symbol = std::move(m_parse_stack_matches.top()); @@ -629,7 +629,7 @@ auto LALR1Parser::report_error() -> std::string { } template -auto LALR1Parser::parse(Reader& reader) -> NonTerminal { +auto Lalr1Parser::parse(Reader& reader) -> NonTerminal { reset(); m_parse_stack_states.push(m_root_item_set_ptr); bool accept = false; @@ -651,7 +651,7 @@ auto LALR1Parser::parse(Reader& reader) -> NonTermin } template -void LALR1Parser::reset() { +void Lalr1Parser::reset() { m_next_token = std::nullopt; while (!m_parse_stack_states.empty()) { m_parse_stack_states.pop(); @@ -664,7 +664,7 @@ void LALR1Parser::reset() { } template -auto LALR1Parser::get_next_symbol() -> Token { +auto Lalr1Parser::get_next_symbol() -> Token { if (m_next_token == std::nullopt) { Token token; if (ErrorCode error = this->m_lexer.scan(m_input_buffer, token); @@ -680,7 +680,7 @@ auto LALR1Parser::get_next_symbol() -> Token { } template -auto LALR1Parser::parse_advance(Token& next_token, bool* accept) +auto Lalr1Parser::parse_advance(Token& next_token, bool* accept) -> bool { for (auto const type : *next_token.m_type_ids_ptr) { if (parse_symbol(type, next_token, accept)) { @@ -694,7 +694,7 @@ auto LALR1Parser::parse_advance(Token& next_token, b } template -auto LALR1Parser::parse_symbol( +auto Lalr1Parser::parse_symbol( uint32_t const& type_id, Token& next_token, bool* accept diff --git a/src/log_surgeon/LogParser.hpp b/src/log_surgeon/LogParser.hpp index 14d77f8..eef04a4 100644 --- a/src/log_surgeon/LogParser.hpp +++ b/src/log_surgeon/LogParser.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include @@ -26,7 +26,7 @@ class LogParser : public Parser schema_ast); diff --git a/src/log_surgeon/ReaderParser.hpp b/src/log_surgeon/ReaderParser.hpp index 9465efb..805cd7b 100644 --- a/src/log_surgeon/ReaderParser.hpp +++ b/src/log_surgeon/ReaderParser.hpp @@ -19,7 +19,7 @@ class ReaderParser { /** * Constructs the parser using the the given schema file. * @param schema_file_path - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure parsing the schema file or processing the schema * AST. */ @@ -28,7 +28,7 @@ class ReaderParser { /** * Constructs the parser using the given schema AST. * @param schema_ast - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure processing the schema AST. */ explicit ReaderParser(std::unique_ptr schema_ast); diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index c7c5e6a..b9f3bf6 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/log_surgeon/SchemaParser.hpp b/src/log_surgeon/SchemaParser.hpp index 748b94a..50ec3f0 100644 --- a/src/log_surgeon/SchemaParser.hpp +++ b/src/log_surgeon/SchemaParser.hpp @@ -5,7 +5,7 @@ #include #include -#include +#include namespace log_surgeon { // ASTs used in SchemaParser AST @@ -69,7 +69,7 @@ class DelimiterStringAST : public ParserAST { }; class SchemaParser - : public LALR1Parser { + : public Lalr1Parser { public: /** * File wrapper around generate_schema_ast() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1e4a836..0551615 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,9 +11,9 @@ set( ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp - ../src/log_surgeon/LALR1Parser.cpp - ../src/log_surgeon/LALR1Parser.hpp - ../src/log_surgeon/LALR1Parser.tpp + ../src/log_surgeon/Lalr1Parser.cpp + ../src/log_surgeon/Lalr1Parser.hpp + ../src/log_surgeon/Lalr1Parser.tpp ../src/log_surgeon/ParserInputBuffer.hpp ../src/log_surgeon/ParserInputBuffer.cpp ../src/log_surgeon/Schema.hpp From 9c2ad815702341124d70ef8c179ef30100904f43 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 04:14:24 -0500 Subject: [PATCH 18/30] Linter. --- src/log_surgeon/finite_automata/DfaState.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/DfaState.hpp b/src/log_surgeon/finite_automata/DfaState.hpp index cbc62ad..ca34a2c 100644 --- a/src/log_surgeon/finite_automata/DfaState.hpp +++ b/src/log_surgeon/finite_automata/DfaState.hpp @@ -24,9 +24,7 @@ class DfaState { public: using Tree = UnicodeIntervalTree; - DfaState() { - std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); - } + DfaState() { std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); } auto add_matching_variable_id(uint32_t const variable_id) -> void { m_matching_variable_ids.push_back(variable_id); From c4fc96b380689956fb42ffeb35532f7835d939df Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 06:58:01 -0500 Subject: [PATCH 19/30] Rename templates to TypedDfaState and TypedNfaState. --- src/log_surgeon/Lalr1Parser.hpp | 10 +- src/log_surgeon/Lalr1Parser.tpp | 122 +++---- src/log_surgeon/Lexer.hpp | 24 +- src/log_surgeon/Lexer.tpp | 114 +++---- src/log_surgeon/LexicalRule.hpp | 14 +- src/log_surgeon/Parser.hpp | 6 +- src/log_surgeon/Parser.tpp | 16 +- src/log_surgeon/finite_automata/Dfa.hpp | 26 +- .../finite_automata/DfaStatePair.hpp | 12 +- src/log_surgeon/finite_automata/Nfa.hpp | 94 +++--- src/log_surgeon/finite_automata/NfaState.hpp | 2 +- src/log_surgeon/finite_automata/RegexAST.hpp | 316 +++++++++--------- .../finite_automata/RegexDFAStateType.hpp | 2 +- .../finite_automata/TaggedTransition.hpp | 24 +- 14 files changed, 391 insertions(+), 391 deletions(-) diff --git a/src/log_surgeon/Lalr1Parser.hpp b/src/log_surgeon/Lalr1Parser.hpp index c13103c..e52fef1 100644 --- a/src/log_surgeon/Lalr1Parser.hpp +++ b/src/log_surgeon/Lalr1Parser.hpp @@ -202,8 +202,8 @@ struct ItemSet { std::vector m_actions; }; -template -class Lalr1Parser : public Parser { +template +class Lalr1Parser : public Parser { public: Lalr1Parser(); @@ -214,7 +214,7 @@ class Lalr1Parser : public Parser { */ auto add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) -> void override; /** @@ -224,7 +224,7 @@ class Lalr1Parser : public Parser { */ auto add_token_group( std::string const& name, - std::unique_ptr> rule_group + std::unique_ptr> rule_group ) -> void; /** @@ -276,7 +276,7 @@ class Lalr1Parser : public Parser { */ auto report_error() -> std::string; - /* Lexer m_lexer; */ + /* Lexer m_lexer; */ std::stack m_parse_stack_matches; std::stack m_parse_stack_states; ItemSet* m_root_item_set_ptr{nullptr}; diff --git a/src/log_surgeon/Lalr1Parser.tpp b/src/log_surgeon/Lalr1Parser.tpp index 6e60d34..2c817d1 100644 --- a/src/log_surgeon/Lalr1Parser.tpp +++ b/src/log_surgeon/Lalr1Parser.tpp @@ -53,8 +53,8 @@ namespace { } } // namespace -template -Lalr1Parser::Lalr1Parser() { +template +Lalr1Parser::Lalr1Parser() { m_terminals.insert((uint32_t)SymbolId::TokenEnd); m_terminals.insert((uint32_t)SymbolId::TokenUncaughtString); m_terminals.insert((uint32_t)SymbolId::TokenInt); @@ -65,43 +65,43 @@ Lalr1Parser::Lalr1Parser() { m_terminals.insert((uint32_t)SymbolId::TokenNewline); } -template -void Lalr1Parser::add_rule( +template +void Lalr1Parser::add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) { - Parser::add_rule(name, std::move(rule)); + Parser::add_rule(name, std::move(rule)); m_terminals.insert(this->m_lexer.m_symbol_id[name]); } -template -void Lalr1Parser::add_token_group( +template +void Lalr1Parser::add_token_group( std::string const& name, - std::unique_ptr> rule_group + std::unique_ptr> rule_group ) { add_rule(name, std::move(rule_group)); } -template -void Lalr1Parser::add_token_chain( +template +void Lalr1Parser::add_token_chain( std::string const& name, std::string const& chain ) { assert(chain.size() > 1); - std::unique_ptr> first_char_rule - = std::make_unique>(chain[0]); - std::unique_ptr> second_char_rule - = std::make_unique>(chain[1]); - std::unique_ptr> rule_chain - = std::make_unique>( + std::unique_ptr> first_char_rule + = std::make_unique>(chain[0]); + std::unique_ptr> second_char_rule + = std::make_unique>(chain[1]); + std::unique_ptr> rule_chain + = std::make_unique>( std::move(first_char_rule), std::move(second_char_rule) ); for (uint32_t i = 2; i < chain.size(); i++) { char next_char = chain[i]; - std::unique_ptr> next_char_rule - = std::make_unique>(next_char); - rule_chain = std::make_unique>( + std::unique_ptr> next_char_rule + = std::make_unique>(next_char); + rule_chain = std::make_unique>( std::move(rule_chain), std::move(next_char_rule) ); @@ -109,8 +109,8 @@ void Lalr1Parser::add_token_chain( add_rule(name, std::move(rule_chain)); } -template -auto Lalr1Parser::add_production( +template +auto Lalr1Parser::add_production( std::string const& head, std::vector const& body, SemanticRule semantic_rule @@ -150,8 +150,8 @@ auto Lalr1Parser::add_production( return n; } -template -void Lalr1Parser::generate() { +template +void Lalr1Parser::generate() { this->m_lexer.generate(); assert(!m_productions.empty()); generate_lr0_kernels(); @@ -160,8 +160,8 @@ void Lalr1Parser::generate() { generate_lalr1_parsing_table(); } -template -void Lalr1Parser::generate_lr0_kernels() { +template +void Lalr1Parser::generate_lr0_kernels() { Production* root_production_ptr = m_productions[m_root_production_id].get(); Item root_item(root_production_ptr, 0, cNullSymbol); std::unique_ptr item_set0 = std::make_unique(); @@ -190,8 +190,8 @@ void Lalr1Parser::generate_lr0_kernels() { } } -template -auto Lalr1Parser::lr_closure_helper( +template +auto Lalr1Parser::lr_closure_helper( ItemSet* item_set_ptr, Item const* item, uint32_t* next_symbol @@ -210,8 +210,8 @@ auto Lalr1Parser::lr_closure_helper( return false; } -template -void Lalr1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { +template +void Lalr1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { std::deque q( item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end() @@ -233,8 +233,8 @@ void Lalr1Parser::generate_lr0_closure(ItemSet* item } } -template -auto Lalr1Parser::go_to( +template +auto Lalr1Parser::go_to( ItemSet* from_item_set, uint32_t const& next_symbol ) -> ItemSet* { @@ -266,8 +266,8 @@ auto Lalr1Parser::go_to( return nullptr; } -template -void Lalr1Parser::generate_first_sets() { +template +void Lalr1Parser::generate_first_sets() { for (uint32_t const& s : m_terminals) { m_firsts.insert(std::pair>(s, {s})); } @@ -298,8 +298,8 @@ void Lalr1Parser::generate_first_sets() { } } -template -void Lalr1Parser::generate_lr1_item_sets() { +template +void Lalr1Parser::generate_lr1_item_sets() { for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) { for (Item const& l0_item : kv.second->m_kernel) { @@ -382,8 +382,8 @@ void Lalr1Parser::generate_lr1_item_sets() { } } -template -void Lalr1Parser::generate_lr1_closure(ItemSet* item_set_ptr) { +template +void Lalr1Parser::generate_lr1_closure(ItemSet* item_set_ptr) { std::deque queue(item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end()); while (!queue.empty()) { Item item = queue.back(); @@ -418,20 +418,20 @@ void Lalr1Parser::generate_lr1_closure(ItemSet* item } } -template -void Lalr1Parser::generate_lalr1_parsing_table() { +template +void Lalr1Parser::generate_lalr1_parsing_table() { generate_lalr1_goto(); generate_lalr1_action(); } -template -void Lalr1Parser::generate_lalr1_goto() { +template +void Lalr1Parser::generate_lalr1_goto() { // done already at end of generate_lr1_item_sets()? } // Dragon book page 253 -template -void Lalr1Parser::generate_lalr1_action() { +template +void Lalr1Parser::generate_lalr1_action() { for (std::map, std::unique_ptr>::value_type const& kv : m_lr1_item_sets) { ItemSet* item_set_ptr = kv.second.get(); @@ -518,8 +518,8 @@ void Lalr1Parser::generate_lalr1_action() { } } -template -auto Lalr1Parser::get_input_after_last_newline( +template +auto Lalr1Parser::get_input_after_last_newline( std::stack& parse_stack_matches ) -> std::string { std::string error_message_reversed; @@ -557,8 +557,8 @@ auto Lalr1Parser::get_input_after_last_newline( return error_message_reversed; } -template -auto Lalr1Parser::get_input_until_next_newline(Token* error_token +template +auto Lalr1Parser::get_input_until_next_newline(Token* error_token ) -> std::string { std::string rest_of_line; bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd); @@ -577,8 +577,8 @@ auto Lalr1Parser::get_input_until_next_newline(Token return rest_of_line; } -template -auto Lalr1Parser::report_error() -> std::string { +template +auto Lalr1Parser::report_error() -> std::string { assert(m_next_token == std::nullopt); assert(!m_parse_stack_matches.empty()); MatchedSymbol top_symbol = std::move(m_parse_stack_matches.top()); @@ -604,7 +604,7 @@ auto Lalr1Parser::report_error() -> std::string { if (action.index() != 0) { error_type += "'"; if (auto* regex_ast_literal - = dynamic_cast*>( + = dynamic_cast*>( this->m_lexer.get_rule(i) )) { @@ -628,8 +628,8 @@ auto Lalr1Parser::report_error() -> std::string { return error_string; } -template -auto Lalr1Parser::parse(Reader& reader) -> NonTerminal { +template +auto Lalr1Parser::parse(Reader& reader) -> NonTerminal { reset(); m_parse_stack_states.push(m_root_item_set_ptr); bool accept = false; @@ -650,8 +650,8 @@ auto Lalr1Parser::parse(Reader& reader) -> NonTermin return std::move(std::get(m)); } -template -void Lalr1Parser::reset() { +template +void Lalr1Parser::reset() { m_next_token = std::nullopt; while (!m_parse_stack_states.empty()) { m_parse_stack_states.pop(); @@ -663,8 +663,8 @@ void Lalr1Parser::reset() { this->m_lexer.reset(); } -template -auto Lalr1Parser::get_next_symbol() -> Token { +template +auto Lalr1Parser::get_next_symbol() -> Token { if (m_next_token == std::nullopt) { Token token; if (ErrorCode error = this->m_lexer.scan(m_input_buffer, token); @@ -679,8 +679,8 @@ auto Lalr1Parser::get_next_symbol() -> Token { return s; } -template -auto Lalr1Parser::parse_advance(Token& next_token, bool* accept) +template +auto Lalr1Parser::parse_advance(Token& next_token, bool* accept) -> bool { for (auto const type : *next_token.m_type_ids_ptr) { if (parse_symbol(type, next_token, accept)) { @@ -693,8 +693,8 @@ auto Lalr1Parser::parse_advance(Token& next_token, b return true; } -template -auto Lalr1Parser::parse_symbol( +template +auto Lalr1Parser::parse_symbol( uint32_t const& type_id, Token& next_token, bool* accept diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 2872ae2..5d34315 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -20,7 +20,7 @@ #include namespace log_surgeon { -template +template class Lexer { public: static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolId::TokenEnd}; @@ -29,11 +29,11 @@ class Lexer { /** * Generate a DFA from an NFA - * @param finite_automata::Nfa nfa - * @return std::unique_ptr> + * @param finite_automata::Nfa nfa + * @return std::unique_ptr> */ - static auto nfa_to_dfa(finite_automata::Nfa& nfa - ) -> std::unique_ptr>; + static auto nfa_to_dfa(finite_automata::Nfa& nfa + ) -> std::unique_ptr>; /** * Add a delimiters line from the schema to the lexer @@ -46,7 +46,7 @@ class Lexer { * @param id * @param regex */ - auto add_rule(uint32_t const& id, std::unique_ptr> rule) + auto add_rule(uint32_t const& id, std::unique_ptr> rule) -> void; /** @@ -54,7 +54,7 @@ class Lexer { * @param variable_id * @return finite_automata::RegexAST* */ - auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; + auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; /** * Generate DFA for lexer @@ -124,7 +124,7 @@ class Lexer { } [[nodiscard]] auto get_dfa( - ) const -> std::unique_ptr> const& { + ) const -> std::unique_ptr> const& { return m_dfa; } @@ -136,7 +136,7 @@ class Lexer { * Return epsilon_closure over m_epsilon_transitions * @return */ - static auto epsilon_closure(NfaStateType const* state_ptr) -> std::set; + static auto epsilon_closure(TypedNfaState const* state_ptr) -> std::set; /** * Get next character from the input buffer @@ -154,12 +154,12 @@ class Lexer { std::set m_type_ids_set; std::array m_is_delimiter{false}; std::array m_is_first_char{false}; - std::vector> m_rules; + std::vector> m_rules; uint32_t m_line{0}; bool m_has_delimiters{false}; - std::unique_ptr> m_dfa; + std::unique_ptr> m_dfa; bool m_asked_for_more_data{false}; - DfaStateType const* m_prev_state{nullptr}; + TypedDfaState const* m_prev_state{nullptr}; }; namespace lexers { diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index c5feb85..f3f3ef6 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -17,8 +17,8 @@ * 4 byte: 0x10000 - 0x1FFFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ namespace log_surgeon { -template -void Lexer::flip_states(uint32_t old_storage_size) { +template +void Lexer::flip_states(uint32_t old_storage_size) { if (m_match_pos >= old_storage_size / 2) { m_match_pos -= old_storage_size / 2; } else { @@ -38,10 +38,10 @@ void Lexer::flip_states(uint32_t old_storage_size) { } } -template -auto Lexer::scan(ParserInputBuffer& input_buffer, Token& token) +template +auto Lexer::scan(ParserInputBuffer& input_buffer, Token& token) -> ErrorCode { - DfaStateType const* state = m_dfa->get_root(); + TypedDfaState const* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; m_asked_for_more_data = false; @@ -80,7 +80,7 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } - DfaStateType* next = state->next(next_char); + TypedDfaState* next = state->next(next_char); if (next_char == '\n') { m_line++; if (m_has_delimiters && !m_match) { @@ -166,13 +166,13 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To } // TODO: this is duplicating almost all the code of scan() -template -auto Lexer::scan_with_wildcard( +template +auto Lexer::scan_with_wildcard( ParserInputBuffer& input_buffer, char wildcard, Token& token ) -> ErrorCode { - DfaStateType const* state = m_dfa->get_root(); + TypedDfaState const* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; m_asked_for_more_data = false; @@ -211,7 +211,7 @@ auto Lexer::scan_with_wildcard( m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } - DfaStateType const* next = state->next(next_char); + TypedDfaState const* next = state->next(next_char); if (next_char == '\n') { m_line++; if (m_has_delimiters && !m_match) { @@ -239,7 +239,7 @@ auto Lexer::scan_with_wildcard( // BFS (keep track of m_type_ids) if (wildcard == '?') { for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - DfaStateType* next_state = state->next(byte); + TypedDfaState* next_state = state->next(byte); if (next_state->is_accepting() == false) { token = Token{m_last_match_pos, @@ -252,11 +252,11 @@ auto Lexer::scan_with_wildcard( } } } else if (wildcard == '*') { - std::stack unvisited_states; - std::set visited_states; + std::stack unvisited_states; + std::set visited_states; unvisited_states.push(state); while (!unvisited_states.empty()) { - DfaStateType const* current_state = unvisited_states.top(); + TypedDfaState const* current_state = unvisited_states.top(); if (current_state == nullptr || current_state->is_accepting() == false) { token = Token{m_last_match_pos, @@ -273,7 +273,7 @@ auto Lexer::scan_with_wildcard( if (m_is_delimiter[byte]) { continue; } - DfaStateType const* next_state = current_state->next(byte); + TypedDfaState const* next_state = current_state->next(byte); if (visited_states.find(next_state) == visited_states.end()) { unvisited_states.push(next_state); } @@ -299,8 +299,8 @@ auto Lexer::scan_with_wildcard( } } -template -auto Lexer::increase_buffer_capacity(ParserInputBuffer& input_buffer +template +auto Lexer::increase_buffer_capacity(ParserInputBuffer& input_buffer ) -> void { uint32_t old_storage_size{0}; bool flipped_static_buffer{false}; @@ -316,8 +316,8 @@ auto Lexer::increase_buffer_capacity(ParserInputBuff } } -template -void Lexer::reset() { +template +void Lexer::reset() { m_last_match_pos = 0; m_match = false; m_line = 0; @@ -330,8 +330,8 @@ void Lexer::reset() { m_prev_state = nullptr; } -template -void Lexer::prepend_start_of_file_char(ParserInputBuffer& input_buffer +template +void Lexer::prepend_start_of_file_char(ParserInputBuffer& input_buffer ) { m_prev_state = m_dfa->get_root()->next(utf8::cCharStartOfFile); m_asked_for_more_data = true; @@ -341,8 +341,8 @@ void Lexer::prepend_start_of_file_char(ParserInputBu m_type_ids = nullptr; } -template -void Lexer::add_delimiters(std::vector const& delimiters) { +template +void Lexer::add_delimiters(std::vector const& delimiters) { assert(!delimiters.empty()); m_has_delimiters = true; for (bool& i : m_is_delimiter) { @@ -354,17 +354,17 @@ void Lexer::add_delimiters(std::vector con m_is_delimiter[utf8::cCharStartOfFile] = true; } -template -void Lexer::add_rule( +template +void Lexer::add_rule( uint32_t const& id, - std::unique_ptr> rule + std::unique_ptr> rule ) { m_rules.emplace_back(id, std::move(rule)); } -template -auto Lexer::get_rule(uint32_t const variable_id -) -> finite_automata::RegexAST* { +template +auto Lexer::get_rule(uint32_t const variable_id +) -> finite_automata::RegexAST* { for (auto const& rule : m_rules) { if (rule.get_variable_id() == variable_id) { return rule.get_regex(); @@ -373,12 +373,12 @@ auto Lexer::get_rule(uint32_t const variable_id return nullptr; } -template -void Lexer::generate() { - finite_automata::Nfa nfa{std::move(m_rules)}; +template +void Lexer::generate() { + finite_automata::Nfa nfa{std::move(m_rules)}; // TODO: DFA ignores tags. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); - DfaStateType const* state = m_dfa->get_root(); + TypedDfaState const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { if (state->next(i) != nullptr) { m_is_first_char[i] = true; @@ -388,11 +388,11 @@ void Lexer::generate() { } } -template -auto Lexer::epsilon_closure(NfaStateType const* state_ptr -) -> std::set { - std::set closure_set; - std::stack stack; +template +auto Lexer::epsilon_closure(TypedNfaState const* state_ptr +) -> std::set { + std::set closure_set; + std::stack stack; stack.push(state_ptr); while (!stack.empty()) { auto const* current_state = stack.top(); @@ -425,17 +425,17 @@ auto Lexer::epsilon_closure(NfaStateType const* stat return closure_set; } -template -auto Lexer::nfa_to_dfa(finite_automata::Nfa& nfa -) -> std::unique_ptr> { - typedef std::set StateSet; - std::unique_ptr> dfa - = std::make_unique>(); - std::map dfa_states; +template +auto Lexer::nfa_to_dfa(finite_automata::Nfa& nfa +) -> std::unique_ptr> { + typedef std::set StateSet; + std::unique_ptr> dfa + = std::make_unique>(); + std::map dfa_states; std::stack unmarked_sets; auto create_dfa_state - = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> DfaStateType* { - DfaStateType* state = dfa->new_state(set); + = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> TypedDfaState* { + TypedDfaState* state = dfa->new_state(set); dfa_states[set] = state; unmarked_sets.push(set); return state; @@ -445,20 +445,20 @@ auto Lexer::nfa_to_dfa(finite_automata::Nfa ascii_transitions_map; // map transitions_map; - for (NfaStateType const* s0 : set) { + for (TypedNfaState const* s0 : set) { for (uint32_t i = 0; i < cSizeOfByte; i++) { - for (NfaStateType* const s1 : s0->get_byte_transitions(i)) { + for (TypedNfaState* const s1 : s0->get_byte_transitions(i)) { StateSet closure = epsilon_closure(s1); ascii_transitions_map[i].insert(closure.begin(), closure.end()); } } // TODO: add this for the utf8 case /* - for (const typename NfaStateType::Tree::Data& data : s0->get_tree_transitions().all()) { - for (NfaStateType* const s1 : data.m_value) { + for (const typename TypedNfaState::Tree::Data& data : s0->get_tree_transitions().all()) { + for (TypedNfaState* const s1 : data.m_value) { StateSet closure = epsilon_closure(s1); transitions_map[data.m_interval].insert(closure.begin(), closure.end()); } @@ -466,8 +466,8 @@ auto Lexer::nfa_to_dfa(finite_automata::Nfa DfaStateType* { - DfaStateType* state{nullptr}; + = [&dfa_states, &create_dfa_state](StateSet const& set) -> TypedDfaState* { + TypedDfaState* state{nullptr}; auto it = dfa_states.find(set); if (it == dfa_states.end()) { state = create_dfa_state(set); @@ -477,15 +477,15 @@ auto Lexer::nfa_to_dfa(finite_automata::Nfa::value_type const& kv : ascii_transitions_map) { - DfaStateType* dest_state = next_dfa_state(kv.second); + TypedDfaState* dest_state = next_dfa_state(kv.second); dfa_state->add_byte_transition(kv.first, dest_state); } // TODO: add this for the utf8 case /* - for (const typename map::value_type& kv : + for (const typename map::value_type& kv : transitions_map) { - DfaStateType* dest_state = next_dfa_state(kv.second); + TypedDfaState* dest_state = next_dfa_state(kv.second); dfa_state->add_tree_transition(kv.first, dest_state); } */ diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp index f5f266b..6ab7e86 100644 --- a/src/log_surgeon/LexicalRule.hpp +++ b/src/log_surgeon/LexicalRule.hpp @@ -6,13 +6,13 @@ #include namespace log_surgeon { -template +template class LexicalRule { public: // Constructor LexicalRule( uint32_t const variable_id, - std::unique_ptr> regex + std::unique_ptr> regex ) : m_variable_id(variable_id), m_regex(std::move(regex)) {} @@ -21,22 +21,22 @@ class LexicalRule { * Adds AST representing the lexical rule to the NFA * @param nfa */ - auto add_to_nfa(finite_automata::Nfa* nfa) const -> void; + auto add_to_nfa(finite_automata::Nfa* nfa) const -> void; [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } - [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { + [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { // TODO: make the returned pointer constant return m_regex.get(); } private: uint32_t m_variable_id; - std::unique_ptr> m_regex; + std::unique_ptr> m_regex; }; -template -void LexicalRule::add_to_nfa(finite_automata::Nfa* nfa) const { +template +void LexicalRule::add_to_nfa(finite_automata::Nfa* nfa) const { auto* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); diff --git a/src/log_surgeon/Parser.hpp b/src/log_surgeon/Parser.hpp index 0caf491..37d5734 100644 --- a/src/log_surgeon/Parser.hpp +++ b/src/log_surgeon/Parser.hpp @@ -5,19 +5,19 @@ namespace log_surgeon { -template +template class Parser { public: Parser(); virtual auto add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) -> void; auto add_token(std::string const& name, char rule_char) -> void; - Lexer m_lexer; + Lexer m_lexer; }; } // namespace log_surgeon diff --git a/src/log_surgeon/Parser.tpp b/src/log_surgeon/Parser.tpp index 8d60ce7..4747072 100644 --- a/src/log_surgeon/Parser.tpp +++ b/src/log_surgeon/Parser.tpp @@ -7,8 +7,8 @@ namespace log_surgeon { -template -Parser::Parser() { +template +Parser::Parser() { // TODO move clp-reserved symbols out of the parser m_lexer.m_symbol_id[cTokenEnd] = (uint32_t)SymbolId::TokenEnd; m_lexer.m_symbol_id[cTokenUncaughtString] = (uint32_t)SymbolId::TokenUncaughtString; @@ -29,10 +29,10 @@ Parser::Parser() { m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenNewline] = cTokenNewline; } -template -void Parser::add_rule( +template +void Parser::add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) { if (m_lexer.m_symbol_id.find(name) == m_lexer.m_symbol_id.end()) { m_lexer.m_symbol_id[name] = m_lexer.m_symbol_id.size(); @@ -41,9 +41,9 @@ void Parser::add_rule( m_lexer.add_rule(m_lexer.m_symbol_id[name], std::move(rule)); } -template -void Parser::add_token(std::string const& name, char rule_char) { - add_rule(name, std::make_unique>(rule_char)); +template +void Parser::add_token(std::string const& name, char rule_char) { + add_rule(name, std::make_unique>(rule_char)); } } // namespace log_surgeon diff --git a/src/log_surgeon/finite_automata/Dfa.hpp b/src/log_surgeon/finite_automata/Dfa.hpp index ae1ea36..8c7e5f9 100644 --- a/src/log_surgeon/finite_automata/Dfa.hpp +++ b/src/log_surgeon/finite_automata/Dfa.hpp @@ -9,7 +9,7 @@ #include namespace log_surgeon::finite_automata { -template +template class Dfa { public: /** @@ -17,10 +17,10 @@ class Dfa { * @param nfa_state_set The set of NFA states represented by this DFA state. * @return A pointer to the new DFA state. */ - template - auto new_state(std::set const& nfa_state_set) -> DfaStateType*; + template + auto new_state(std::set const& nfa_state_set) -> TypedDfaState*; - auto get_root() const -> DfaStateType const* { return m_states.at(0).get(); } + auto get_root() const -> TypedDfaState const* { return m_states.at(0).get(); } /** * Compares this dfa with `dfa_in` to determine the set of schema types in this dfa that are @@ -33,13 +33,13 @@ class Dfa { [[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set; private: - std::vector> m_states; + std::vector> m_states; }; -template -template -auto Dfa::new_state(std::set const& nfa_state_set) -> DfaStateType* { - m_states.emplace_back(std::make_unique()); +template +template +auto Dfa::new_state(std::set const& nfa_state_set) -> TypedDfaState* { + m_states.emplace_back(std::make_unique()); auto* dfa_state = m_states.back().get(); for (auto const* nfa_state : nfa_state_set) { if (nfa_state->is_accepting()) { @@ -49,11 +49,11 @@ auto Dfa::new_state(std::set const& nfa_state_set) return dfa_state; } -template -auto Dfa::get_intersect(Dfa const* dfa_in) const -> std::set { +template +auto Dfa::get_intersect(Dfa const* dfa_in) const -> std::set { std::set schema_types; - std::set> unvisited_pairs; - std::set> visited_pairs; + std::set> unvisited_pairs; + std::set> visited_pairs; unvisited_pairs.emplace(this->get_root(), dfa_in->get_root()); // TODO: Handle UTF-8 (multi-byte transitions) as well while (false == unvisited_pairs.empty()) { diff --git a/src/log_surgeon/finite_automata/DfaStatePair.hpp b/src/log_surgeon/finite_automata/DfaStatePair.hpp index 568142d..67ecb62 100644 --- a/src/log_surgeon/finite_automata/DfaStatePair.hpp +++ b/src/log_surgeon/finite_automata/DfaStatePair.hpp @@ -19,10 +19,10 @@ namespace log_surgeon::finite_automata { * * NOTE: Only the first state in the pair contains the variable types matched by the pair. */ -template +template class DfaStatePair { public: - DfaStatePair(DfaState const* state1, DfaState const* state2) + DfaStatePair(TypedDfaState const* state1, TypedDfaState const* state2) : m_state1(state1), m_state2(state2) {}; @@ -59,12 +59,12 @@ class DfaStatePair { } private: - DfaState const* m_state1; - DfaState const* m_state2; + TypedDfaState const* m_state1; + TypedDfaState const* m_state2; }; -template -auto DfaStatePair::get_reachable_pairs( +template +auto DfaStatePair::get_reachable_pairs( std::set& visited_pairs, std::set& unvisited_pairs ) const -> void { diff --git a/src/log_surgeon/finite_automata/Nfa.hpp b/src/log_surgeon/finite_automata/Nfa.hpp index caf58ce..8eaaaad 100644 --- a/src/log_surgeon/finite_automata/Nfa.hpp +++ b/src/log_surgeon/finite_automata/Nfa.hpp @@ -17,18 +17,18 @@ #include namespace log_surgeon::finite_automata { -template +template class Nfa { public: - using StateVec = std::vector; + using StateVec = std::vector; - explicit Nfa(std::vector> rules); + explicit Nfa(std::vector> rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. - * @return NfaStateType* + * @return TypedNfaState* */ - [[nodiscard]] auto new_state() -> NfaStateType*; + [[nodiscard]] auto new_state() -> TypedNfaState*; /** * Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to @@ -39,20 +39,20 @@ class Nfa { */ [[nodiscard]] auto new_state_with_positive_tagged_end_transition( Tag const* tag, - NfaStateType const* dest_state - ) -> NfaStateType*; + TypedNfaState const* dest_state + ) -> TypedNfaState*; /** * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. * @param tags * @param dest_state - * @return NfaStateType* + * @return TypedNfaState* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( std::vector tags, - NfaStateType const* dest_state - ) -> NfaStateType*; + TypedNfaState const* dest_state + ) -> TypedNfaState*; /** * Creates the start and end states for a capture group. @@ -64,38 +64,38 @@ class Nfa { */ [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, - NfaStateType const* dest_state - ) -> std::pair; + TypedNfaState const* dest_state + ) -> std::pair; /** * @return A vector representing the traversal order of the NFA states using breadth-first * search (BFS). */ - [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; + [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; /** * @return A string representation of the NFA. */ [[nodiscard]] auto serialize() const -> std::string; - auto add_root_interval(Interval interval, NfaStateType* dest_state) -> void { + auto add_root_interval(Interval interval, TypedNfaState* dest_state) -> void { m_root->add_interval(interval, dest_state); } - auto set_root(NfaStateType* root) -> void { m_root = root; } + auto set_root(TypedNfaState* root) -> void { m_root = root; } - auto get_root() -> NfaStateType* { return m_root; } + auto get_root() -> TypedNfaState* { return m_root; } private: - std::vector> m_states; - NfaStateType* m_root; + std::vector> m_states; + TypedNfaState* m_root; // Store the rules locally as they contain information needed by the NFA. E.g., transitions in // the NFA point to tags in the rule ASTs. - std::vector> m_rules; + std::vector> m_rules; }; -template -Nfa::Nfa(std::vector> rules) +template +Nfa::Nfa(std::vector> rules) : m_root{new_state()}, m_rules{std::move(rules)} { for (auto const& rule : m_rules) { @@ -103,35 +103,35 @@ Nfa::Nfa(std::vector> rules) } } -template -auto Nfa::new_state() -> NfaStateType* { - m_states.emplace_back(std::make_unique()); +template +auto Nfa::new_state() -> TypedNfaState* { + m_states.emplace_back(std::make_unique()); return m_states.back().get(); } -template -auto Nfa::new_state_with_positive_tagged_end_transition( +template +auto Nfa::new_state_with_positive_tagged_end_transition( Tag const* tag, - NfaStateType const* dest_state -) -> NfaStateType* { - m_states.emplace_back(std::make_unique(tag, dest_state)); + TypedNfaState const* dest_state +) -> TypedNfaState* { + m_states.emplace_back(std::make_unique(tag, dest_state)); return m_states.back().get(); } -template -auto Nfa::new_state_with_negative_tagged_transition( +template +auto Nfa::new_state_with_negative_tagged_transition( std::vector tags, - NfaStateType const* dest_state -) -> NfaStateType* { - m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); + TypedNfaState const* dest_state +) -> TypedNfaState* { + m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); } -template -auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( +template +auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, - NfaStateType const* dest_state -) -> std::pair { + TypedNfaState const* dest_state +) -> std::pair { auto* start_state = new_state(); m_root->add_positive_tagged_start_transition(tag, start_state); @@ -139,16 +139,16 @@ auto Nfa::new_start_and_end_states_with_positive_tagged_transition return {start_state, end_state}; } -template -auto Nfa::get_bfs_traversal_order() const -> std::vector { - std::queue state_queue; - std::unordered_set visited_states; - std::vector visited_order; +template +auto Nfa::get_bfs_traversal_order() const -> std::vector { + std::queue state_queue; + std::unordered_set visited_states; + std::vector visited_order; visited_states.reserve(m_states.size()); visited_order.reserve(m_states.size()); auto add_to_queue_and_visited - = [&state_queue, &visited_states](NfaStateType const* dest_state) { + = [&state_queue, &visited_states](TypedNfaState const* dest_state) { if (visited_states.insert(dest_state).second) { state_queue.push(dest_state); } @@ -190,11 +190,11 @@ auto Nfa::get_bfs_traversal_order() const -> std::vector -auto Nfa::serialize() const -> std::string { +template +auto Nfa::serialize() const -> std::string { auto const traversal_order = get_bfs_traversal_order(); - std::unordered_map state_ids; + std::unordered_map state_ids; for (auto const* state : traversal_order) { state_ids.emplace(state, state_ids.size()); } diff --git a/src/log_surgeon/finite_automata/NfaState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp index 7c406dc..5c10384 100644 --- a/src/log_surgeon/finite_automata/NfaState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -113,7 +113,7 @@ class NfaState { std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == - // DfaStateType::Byte` case, so we use an empty class (`std::tuple<>`) + // NfaStateType::Byte` case, so we use an empty class (`std::tuple<>`) // in that case. std::conditional_t> m_tree_transitions; }; diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 9573c23..010d533 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -24,7 +24,7 @@ #include namespace log_surgeon::finite_automata { -template +template class Nfa; // TODO: rename `RegexAST` to `RegexASTNode` @@ -40,9 +40,9 @@ class Nfa; * ASTs built using this class are assumed to be constructed in a bottom-up manner, where all * descendant nodes are created first. * - * @tparam NfaStateType Whether this AST is used for byte lexing or UTF-8 lexing. + * @tparam TypedNfaState Whether this AST is used for byte lexing or UTF-8 lexing. */ -template +template class RegexAST { public: RegexAST() = default; @@ -75,7 +75,7 @@ class RegexAST { * @param nfa * @param end_state */ - virtual auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void = 0; + virtual auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void = 0; /** * Serializes the AST with this node as the root. @@ -109,7 +109,7 @@ class RegexAST { * @param end_state */ auto - add_to_nfa_with_negative_tags(Nfa* nfa, NfaStateType* end_state) const -> void { + add_to_nfa_with_negative_tags(Nfa* nfa, TypedNfaState* end_state) const -> void { // Handle negative tags as: // root --(regex)--> state_with_negative_tagged_transition --(negative tags)--> end_state if (false == m_negative_tags.empty()) { @@ -155,10 +155,10 @@ class RegexAST { * repetition with a minimum repetition of 0. Namely, we treat `R{0,N}` as `R{1,N} | ∅`. Then, the * NFA handles the 0 repetition case using the logic in `RegexASTOR` (i.e., adding a negative * transition for every capture group matched in `R{1,N}`). - * @tparam NfaStateType Whether this AST is used for byte lexing or UTF-8 lexing. + * @tparam TypedNfaState Whether this AST is used for byte lexing or UTF-8 lexing. */ -template -class RegexASTEmpty : public RegexAST { +template +class RegexASTEmpty : public RegexAST { public: RegexASTEmpty() = default; @@ -178,8 +178,8 @@ class RegexASTEmpty : public RegexAST { } auto add_to_nfa( - [[maybe_unused]] Nfa* nfa, - [[maybe_unused]] NfaStateType* end_state + [[maybe_unused]] Nfa* nfa, + [[maybe_unused]] TypedNfaState* end_state ) const -> void override { // Do nothing as adding an empty node to the NFA is a null operation. } @@ -187,8 +187,8 @@ class RegexASTEmpty : public RegexAST { [[nodiscard]] auto serialize() const -> std::u32string override; }; -template -class RegexASTLiteral : public RegexAST { +template +class RegexASTLiteral : public RegexAST { public: explicit RegexASTLiteral(uint32_t character); @@ -226,7 +226,7 @@ class RegexASTLiteral : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -236,8 +236,8 @@ class RegexASTLiteral : public RegexAST { uint32_t m_character; }; -template -class RegexASTInteger : public RegexAST { +template +class RegexASTInteger : public RegexAST { public: explicit RegexASTInteger(uint32_t digit); @@ -279,7 +279,7 @@ class RegexASTInteger : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -291,24 +291,24 @@ class RegexASTInteger : public RegexAST { std::vector m_digits; }; -template -class RegexASTGroup : public RegexAST { +template +class RegexASTGroup : public RegexAST { public: using Range = std::pair; RegexASTGroup() = default; - explicit RegexASTGroup(RegexASTLiteral const* right); + explicit RegexASTGroup(RegexASTLiteral const* right); explicit RegexASTGroup(RegexASTGroup const* right); - RegexASTGroup(RegexASTGroup const* left, RegexASTLiteral const* right); + RegexASTGroup(RegexASTGroup const* left, RegexASTLiteral const* right); RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right); RegexASTGroup( - RegexASTLiteral const* left, - RegexASTLiteral const* right + RegexASTLiteral const* left, + RegexASTLiteral const* right ); RegexASTGroup(uint32_t min, uint32_t max); @@ -387,7 +387,7 @@ class RegexASTGroup : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -424,20 +424,20 @@ class RegexASTGroup : public RegexAST { std::vector m_ranges; }; -template -class RegexASTOr : public RegexAST { +template +class RegexASTOr : public RegexAST { public: ~RegexASTOr() override = default; RegexASTOr( - std::unique_ptr> left, - std::unique_ptr> right + std::unique_ptr> left, + std::unique_ptr> right ); RegexASTOr(RegexASTOr const& rhs) - : RegexAST(rhs), - m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : RegexAST(rhs), + m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * Used for cloning a unique_pointer of type RegexASTOr @@ -474,33 +474,33 @@ class RegexASTOr : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } + [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } - [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } + [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } private: - std::unique_ptr> m_left; - std::unique_ptr> m_right; + std::unique_ptr> m_left; + std::unique_ptr> m_right; }; -template -class RegexASTCat : public RegexAST { +template +class RegexASTCat : public RegexAST { public: ~RegexASTCat() override = default; RegexASTCat( - std::unique_ptr> left, - std::unique_ptr> right + std::unique_ptr> left, + std::unique_ptr> right ); RegexASTCat(RegexASTCat const& rhs) - : RegexAST(rhs), - m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : RegexAST(rhs), + m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * Used for cloning a unique_pointer of type RegexASTCat @@ -537,33 +537,33 @@ class RegexASTCat : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } + [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } - [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } + [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } private: - std::unique_ptr> m_left; - std::unique_ptr> m_right; + std::unique_ptr> m_left; + std::unique_ptr> m_right; }; -template -class RegexASTMultiplication : public RegexAST { +template +class RegexASTMultiplication : public RegexAST { public: ~RegexASTMultiplication() override = default; RegexASTMultiplication( - std::unique_ptr> operand, + std::unique_ptr> operand, uint32_t min, uint32_t max ); RegexASTMultiplication(RegexASTMultiplication const& rhs) - : RegexAST(rhs), - m_operand(std::unique_ptr>(rhs.m_operand->clone())), + : RegexAST(rhs), + m_operand(std::unique_ptr>(rhs.m_operand->clone())), m_min(rhs.m_min), m_max(rhs.m_max) {} @@ -601,13 +601,13 @@ class RegexASTMultiplication : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; [[nodiscard]] auto is_infinite() const -> bool { return this->m_max == 0; } - [[nodiscard]] auto get_operand() const -> std::unique_ptr> const& { + [[nodiscard]] auto get_operand() const -> std::unique_ptr> const& { return m_operand; } @@ -616,7 +616,7 @@ class RegexASTMultiplication : public RegexAST { [[nodiscard]] auto get_max() const -> uint32_t { return m_max; } private: - std::unique_ptr> m_operand; + std::unique_ptr> m_operand; uint32_t m_min; uint32_t m_max; }; @@ -626,10 +626,10 @@ class RegexASTMultiplication : public RegexAST { * NOTE: * - `m_tag` is always expected to be non-null. * - `m_group_regex_ast` is always expected to be non-null. - * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template -class RegexASTCapture : public RegexAST { +template +class RegexASTCapture : public RegexAST { public: ~RegexASTCapture() override = default; @@ -639,7 +639,7 @@ class RegexASTCapture : public RegexAST { * @throw std::invalid_argument if `group_regex_ast` or `tag` are `nullptr`. */ RegexASTCapture( - std::unique_ptr> group_regex_ast, + std::unique_ptr> group_regex_ast, std::unique_ptr tag ) : m_group_regex_ast{( @@ -649,19 +649,19 @@ class RegexASTCapture : public RegexAST { )}, m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : std::move(tag)} { - RegexAST::set_subtree_positive_tags( + RegexAST::set_subtree_positive_tags( m_group_regex_ast->get_subtree_positive_tags() ); - RegexAST::add_subtree_positive_tags({m_tag.get()}); + RegexAST::add_subtree_positive_tags({m_tag.get()}); } RegexASTCapture(RegexASTCapture const& rhs) - : RegexAST{rhs}, + : RegexAST{rhs}, m_group_regex_ast{ - std::unique_ptr>(rhs.m_group_regex_ast->clone()) + std::unique_ptr>(rhs.m_group_regex_ast->clone()) }, m_tag{std::make_unique(*rhs.m_tag)} { - RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); } /** @@ -697,159 +697,159 @@ class RegexASTCapture : public RegexAST { * @param nfa * @param dest_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* dest_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* dest_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; [[nodiscard]] auto get_group_name() const -> std::string_view { return m_tag->get_name(); } [[nodiscard]] auto get_group_regex_ast( - ) const -> std::unique_ptr> const& { + ) const -> std::unique_ptr> const& { return m_group_regex_ast; } private: - std::unique_ptr> m_group_regex_ast; + std::unique_ptr> m_group_regex_ast; std::unique_ptr m_tag; }; -template -[[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { - return fmt::format(U"{}", RegexAST::serialize_negative_tags()); +template +[[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { + return fmt::format(U"{}", RegexAST::serialize_negative_tags()); } -template -RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} +template +RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} -template -void RegexASTLiteral::add_to_nfa(Nfa* nfa, NfaStateType* end_state) +template +void RegexASTLiteral::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { nfa->add_root_interval(Interval(m_character, m_character), end_state); } -template -[[nodiscard]] auto RegexASTLiteral::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTLiteral::serialize() const -> std::u32string { return fmt::format( U"{}{}", static_cast(m_character), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTInteger::RegexASTInteger(uint32_t digit) { +template +RegexASTInteger::RegexASTInteger(uint32_t digit) { digit = digit - '0'; m_digits.push_back(digit); } -template -RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t digit) +template +RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t digit) : m_digits(std::move(left->m_digits)) { digit = digit - '0'; m_digits.push_back(digit); } -template -void RegexASTInteger::add_to_nfa( - [[maybe_unused]] Nfa* nfa, - [[maybe_unused]] NfaStateType* end_state +template +void RegexASTInteger::add_to_nfa( + [[maybe_unused]] Nfa* nfa, + [[maybe_unused]] TypedNfaState* end_state ) const { throw std::runtime_error("Unsupported"); } -template -[[nodiscard]] auto RegexASTInteger::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTInteger::serialize() const -> std::u32string { auto const digits_string = fmt::format("{}", fmt::join(m_digits, "")); return fmt::format( U"{}{}", std::u32string(digits_string.begin(), digits_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTOr::RegexASTOr( - std::unique_ptr> left, - std::unique_ptr> right +template +RegexASTOr::RegexASTOr( + std::unique_ptr> left, + std::unique_ptr> right ) : m_left(std::move(left)), m_right(std::move(right)) { m_left->set_negative_tags(m_right->get_subtree_positive_tags()); m_right->set_negative_tags(m_left->get_subtree_positive_tags()); - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); + RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); } -template -void RegexASTOr::add_to_nfa(Nfa* nfa, NfaStateType* end_state) const { +template +void RegexASTOr::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { m_left->add_to_nfa_with_negative_tags(nfa, end_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); } -template -[[nodiscard]] auto RegexASTOr::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTOr::serialize() const -> std::u32string { return fmt::format( U"({})|({}){}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTCat::RegexASTCat( - std::unique_ptr> left, - std::unique_ptr> right +template +RegexASTCat::RegexASTCat( + std::unique_ptr> left, + std::unique_ptr> right ) : m_left(std::move(left)), m_right(std::move(right)) { - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); + RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); } -template -void RegexASTCat::add_to_nfa(Nfa* nfa, NfaStateType* end_state) const { - NfaStateType* saved_root = nfa->get_root(); - NfaStateType* intermediate_state = nfa->new_state(); +template +void RegexASTCat::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { + TypedNfaState* saved_root = nfa->get_root(); + TypedNfaState* intermediate_state = nfa->new_state(); m_left->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); nfa->set_root(saved_root); } -template -[[nodiscard]] auto RegexASTCat::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTCat::serialize() const -> std::u32string { return fmt::format( U"{}{}{}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTMultiplication::RegexASTMultiplication( - std::unique_ptr> operand, +template +RegexASTMultiplication::RegexASTMultiplication( + std::unique_ptr> operand, uint32_t const min, uint32_t const max ) : m_operand(std::move(operand)), m_min(min), m_max(max) { - RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); } -template -void RegexASTMultiplication::add_to_nfa( - Nfa* nfa, - NfaStateType* end_state +template +void RegexASTMultiplication::add_to_nfa( + Nfa* nfa, + TypedNfaState* end_state ) const { - NfaStateType* saved_root = nfa->get_root(); + TypedNfaState* saved_root = nfa->get_root(); if (this->m_min == 0) { nfa->get_root()->add_epsilon_transition(end_state); } else { for (uint32_t i = 1; i < this->m_min; i++) { - NfaStateType* intermediate_state = nfa->new_state(); + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } @@ -860,13 +860,13 @@ void RegexASTMultiplication::add_to_nfa( m_operand->add_to_nfa_with_negative_tags(nfa, end_state); } else if (this->m_max > this->m_min) { if (this->m_min != 0) { - NfaStateType* intermediate_state = nfa->new_state(); + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } for (uint32_t i = this->m_min + 1; i < this->m_max; ++i) { m_operand->add_to_nfa_with_negative_tags(nfa, end_state); - NfaStateType* intermediate_state = nfa->new_state(); + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } @@ -875,8 +875,8 @@ void RegexASTMultiplication::add_to_nfa( nfa->set_root(saved_root); } -template -[[nodiscard]] auto RegexASTMultiplication::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTMultiplication::serialize() const -> std::u32string { auto const min_string = std::to_string(m_min); auto const max_string = std::to_string(m_max); @@ -885,12 +885,12 @@ template nullptr != m_operand ? m_operand->serialize() : U"null", std::u32string(min_string.begin(), min_string.end()), is_infinite() ? U"inf" : std::u32string(max_string.begin(), max_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -auto RegexASTCapture::add_to_nfa(Nfa* nfa, NfaStateType* dest_state) +template +auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNfaState* dest_state) const -> void { // TODO: move this into a documentation file in the future, and reference it here. // The NFA constructed for a capture group follows the structure below, with tagged transitions @@ -935,21 +935,21 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, NfaStateT nfa->set_root(initial_root); } -template -[[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { auto const tag_name_u32 = std::u32string(m_tag->get_name().cbegin(), m_tag->get_name().cend()); return fmt::format( U"({})<{}>{}", m_group_regex_ast->serialize(), tag_name_u32, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTGroup::RegexASTGroup( +template +RegexASTGroup::RegexASTGroup( RegexASTGroup const* left, - RegexASTLiteral const* right + RegexASTLiteral const* right ) { if (right == nullptr) { throw std::runtime_error("RegexASTGroup1: right == nullptr: A bracket expression in the " @@ -961,16 +961,16 @@ RegexASTGroup::RegexASTGroup( m_ranges.emplace_back(right->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right) +template +RegexASTGroup::RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right) : m_negate(left->m_negate), m_ranges(left->m_ranges) { assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } -template -RegexASTGroup::RegexASTGroup(RegexASTLiteral const* right) { +template +RegexASTGroup::RegexASTGroup(RegexASTLiteral const* right) { if (right == nullptr) { throw std::runtime_error("RegexASTGroup2: right == nullptr: A bracket expression in the " "schema contains illegal characters, remember to escape special " @@ -980,16 +980,16 @@ RegexASTGroup::RegexASTGroup(RegexASTLiteral const* m_ranges.emplace_back(right->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(RegexASTGroup const* right) : m_negate(false) { +template +RegexASTGroup::RegexASTGroup(RegexASTGroup const* right) : m_negate(false) { assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } -template -RegexASTGroup::RegexASTGroup( - RegexASTLiteral const* left, - RegexASTLiteral const* right +template +RegexASTGroup::RegexASTGroup( + RegexASTLiteral const* left, + RegexASTLiteral const* right ) { if (left == nullptr || right == nullptr) { throw std::runtime_error( @@ -1003,22 +1003,22 @@ RegexASTGroup::RegexASTGroup( m_ranges.emplace_back(left->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(std::vector const& literals) +template +RegexASTGroup::RegexASTGroup(std::vector const& literals) : m_negate(false) { for (uint32_t literal : literals) { m_ranges.emplace_back(literal, literal); } } -template -RegexASTGroup::RegexASTGroup(uint32_t min, uint32_t max) : m_negate(false) { +template +RegexASTGroup::RegexASTGroup(uint32_t min, uint32_t max) : m_negate(false) { m_ranges.emplace_back(min, max); } // ranges must be sorted -template -auto RegexASTGroup::merge(std::vector const& ranges) -> std::vector { +template +auto RegexASTGroup::merge(std::vector const& ranges) -> std::vector { std::vector merged_ranges; if (ranges.empty()) { return merged_ranges; @@ -1038,8 +1038,8 @@ auto RegexASTGroup::merge(std::vector const& ranges) -> std } // ranges must be sorted and non-overlapping -template -auto RegexASTGroup::complement(std::vector const& ranges +template +auto RegexASTGroup::complement(std::vector const& ranges ) -> std::vector { std::vector complemented; uint32_t low = 0; @@ -1055,8 +1055,8 @@ auto RegexASTGroup::complement(std::vector const& ranges return complemented; } -template -void RegexASTGroup::add_to_nfa(Nfa* nfa, NfaStateType* end_state) +template +void RegexASTGroup::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { // TODO: there should be a better way to do this with a set and keep m_ranges sorted, but we // have to consider removing overlap + taking the compliment. @@ -1071,8 +1071,8 @@ void RegexASTGroup::add_to_nfa(Nfa* nfa, NfaStateTyp } } -template -[[nodiscard]] auto RegexASTGroup::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTGroup::serialize() const -> std::u32string { std::u32string ranges_serialized; if (m_is_wildcard) { ranges_serialized += U"*"; @@ -1098,7 +1098,7 @@ template U"[{}{}]{}", m_negate ? U"^" : U"", ranges_serialized, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp b/src/log_surgeon/finite_automata/RegexDFAStateType.hpp index ae4e52d..8cb5ce0 100644 --- a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAStateType.hpp @@ -4,7 +4,7 @@ #include namespace log_surgeon::finite_automata { -enum class RegexDFAStateType : uint8_t { +enum class RegexTypedDfaState : uint8_t { Byte, UTF8 }; diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 4da9b5f..7c7b492 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -15,9 +15,9 @@ namespace log_surgeon::finite_automata { /** * Represents an NFA transition indicating that a capture group has been matched. * NOTE: `m_tag` is always expected to be non-null. - * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template +template class PositiveTaggedTransition { public: /** @@ -25,18 +25,18 @@ class PositiveTaggedTransition { * @param dest_state * @throw std::invalid_argument if `tag` is `nullptr`. */ - PositiveTaggedTransition(Tag const* tag, NfaStateType const* dest_state) + PositiveTaggedTransition(Tag const* tag, TypedNfaState const* dest_state) : m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : tag}, m_dest_state{dest_state} {} - [[nodiscard]] auto get_dest_state() const -> NfaStateType const* { return m_dest_state; } + [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the positive tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ - [[nodiscard]] auto serialize(std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { @@ -47,15 +47,15 @@ class PositiveTaggedTransition { private: Tag const* m_tag; - NfaStateType const* m_dest_state; + TypedNfaState const* m_dest_state; }; /** * Represents an NFA transition indicating that a capture group has been unmatched. * NOTE: All tags in `m_tags` are always expected to be non-null. - * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template +template class NegativeTaggedTransition { public: /** @@ -63,7 +63,7 @@ class NegativeTaggedTransition { * @param dest_state * @throw std::invalid_argument if any elements in `tags` is `nullptr`. */ - NegativeTaggedTransition(std::vector tags, NfaStateType const* dest_state) + NegativeTaggedTransition(std::vector tags, TypedNfaState const* dest_state) : m_tags{[&tags] { if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { throw std::invalid_argument("Tags cannot contain null elements"); @@ -72,14 +72,14 @@ class NegativeTaggedTransition { }()}, m_dest_state{dest_state} {} - [[nodiscard]] auto get_dest_state() const -> NfaStateType const* { return m_dest_state; } + [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the negative tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ - [[nodiscard]] auto serialize(std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { @@ -93,7 +93,7 @@ class NegativeTaggedTransition { private: std::vector m_tags; - NfaStateType const* m_dest_state; + TypedNfaState const* m_dest_state; }; } // namespace log_surgeon::finite_automata From a6bbaeffb9d021bab022a5b12c906db542cf0e54 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 07:02:10 -0500 Subject: [PATCH 20/30] Rename to Utf8*State and Byte*State. --- src/log_surgeon/Lexer.hpp | 4 +-- src/log_surgeon/LogParser.cpp | 30 ++++++++++---------- src/log_surgeon/LogParser.hpp | 2 +- src/log_surgeon/SchemaParser.cpp | 20 ++++++------- src/log_surgeon/SchemaParser.hpp | 6 ++-- src/log_surgeon/finite_automata/DfaState.hpp | 4 +-- src/log_surgeon/finite_automata/NfaState.hpp | 6 ++-- 7 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 5d34315..fc408f2 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -163,8 +163,8 @@ class Lexer { }; namespace lexers { -using ByteLexer = Lexer; -using Utf8Lexer = Lexer; +using ByteLexer = Lexer; +using Utf8Lexer = Lexer; } // namespace lexers } // namespace log_surgeon diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index be680c5..89d3abf 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -17,8 +17,8 @@ using std::unique_ptr; using std::vector; namespace log_surgeon { -using finite_automata::DfaByteState; -using finite_automata::NfaByteState; +using finite_automata::ByteDfaState; +using finite_automata::ByteNfaState; using finite_automata::RegexAST; using finite_automata::RegexASTCat; using finite_automata::RegexASTGroup; @@ -62,24 +62,24 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* rule = dynamic_cast(parser_ast.get()); if (rule->m_name == "timestamp") { - unique_ptr> first_timestamp_regex_ast(rule->m_regex_ptr->clone() + unique_ptr> first_timestamp_regex_ast(rule->m_regex_ptr->clone() ); - unique_ptr> r1 - = make_unique>(utf8::cCharStartOfFile); + unique_ptr> r1 + = make_unique>(utf8::cCharStartOfFile); add_rule( "firstTimestamp", - make_unique>( + make_unique>( std::move(r1), std::move(first_timestamp_regex_ast) ) ); - unique_ptr> newline_timestamp_regex_ast(rule->m_regex_ptr->clone( + unique_ptr> newline_timestamp_regex_ast(rule->m_regex_ptr->clone( )); - unique_ptr> r2 - = make_unique>('\n'); + unique_ptr> r2 + = make_unique>('\n'); add_rule( "newLineTimestamp", - make_unique>( + make_unique>( std::move(r2), std::move(newline_timestamp_regex_ast) ) @@ -140,9 +140,9 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { } // For log-specific lexing: modify variable regex to contain a delimiter at the start. - unique_ptr> delimiter_group - = make_unique>(RegexASTGroup(delimiters)); - rule->m_regex_ptr = make_unique>( + unique_ptr> delimiter_group + = make_unique>(RegexASTGroup(delimiters)); + rule->m_regex_ptr = make_unique>( std::move(delimiter_group), std::move(rule->m_regex_ptr) ); @@ -193,7 +193,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { // make a message with just the '\n' character next_token.m_end_pos = next_token.m_start_pos + 1; next_token.m_type_ids_ptr - = &Lexer::cTokenUncaughtStringTypes; + = &Lexer::cTokenUncaughtStringTypes; output_buffer->set_token(1, next_token); output_buffer->set_pos(2); m_input_buffer.set_consumed_pos(next_token.m_start_pos); @@ -259,7 +259,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { Token curr_token = output_buffer->get_curr_token(); curr_token.m_end_pos = curr_token.m_start_pos + 1; curr_token.m_type_ids_ptr - = &Lexer::cTokenUncaughtStringTypes; + = &Lexer::cTokenUncaughtStringTypes; output_buffer->set_curr_token(curr_token); if (0 == m_start_of_log_message.m_start_pos) { m_input_buffer.set_consumed_pos(m_input_buffer.storage().size() - 1); diff --git a/src/log_surgeon/LogParser.hpp b/src/log_surgeon/LogParser.hpp index eef04a4..7605fe5 100644 --- a/src/log_surgeon/LogParser.hpp +++ b/src/log_surgeon/LogParser.hpp @@ -15,7 +15,7 @@ namespace log_surgeon { // TODO: Compare c-array vs. vectors (its underlying array) for buffers -class LogParser : public Parser { +class LogParser : public Parser { public: enum class ParsingAction { None, diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index b9f3bf6..3c7bd2c 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -16,25 +16,25 @@ #include using ParserValueRegex = log_surgeon::ParserValue>>; + log_surgeon::finite_automata::RegexAST>>; using RegexASTByte - = log_surgeon::finite_automata::RegexAST; + = log_surgeon::finite_automata::RegexAST; using RegexASTGroupByte - = log_surgeon::finite_automata::RegexASTGroup; + = log_surgeon::finite_automata::RegexASTGroup; using RegexASTIntegerByte - = log_surgeon::finite_automata::RegexASTInteger; + = log_surgeon::finite_automata::RegexASTInteger; using RegexASTLiteralByte - = log_surgeon::finite_automata::RegexASTLiteral; + = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< - log_surgeon::finite_automata::NfaByteState>; + log_surgeon::finite_automata::ByteNfaState>; using RegexASTOrByte - = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTOr; using RegexASTCatByte - = log_surgeon::finite_automata::RegexASTCat; + = log_surgeon::finite_automata::RegexASTCat; using RegexASTCaptureByte - = log_surgeon::finite_automata::RegexASTCapture; + = log_surgeon::finite_automata::RegexASTCapture; using RegexASTEmptyByte - = log_surgeon::finite_automata::RegexASTEmpty; + = log_surgeon::finite_automata::RegexASTEmpty; using std::make_unique; using std::string; diff --git a/src/log_surgeon/SchemaParser.hpp b/src/log_surgeon/SchemaParser.hpp index 50ec3f0..36db611 100644 --- a/src/log_surgeon/SchemaParser.hpp +++ b/src/log_surgeon/SchemaParser.hpp @@ -46,7 +46,7 @@ class SchemaVarAST : public ParserAST { // Constructor SchemaVarAST( std::string name, - std::unique_ptr> regex_ptr, + std::unique_ptr> regex_ptr, uint32_t line_num ) : m_line_num(line_num), @@ -55,7 +55,7 @@ class SchemaVarAST : public ParserAST { uint32_t m_line_num; std::string m_name; - std::unique_ptr> m_regex_ptr; + std::unique_ptr> m_regex_ptr; }; class DelimiterStringAST : public ParserAST { @@ -69,7 +69,7 @@ class DelimiterStringAST : public ParserAST { }; class SchemaParser - : public Lalr1Parser { + : public Lalr1Parser { public: /** * File wrapper around generate_schema_ast() diff --git a/src/log_surgeon/finite_automata/DfaState.hpp b/src/log_surgeon/finite_automata/DfaState.hpp index ca34a2c..f25b25a 100644 --- a/src/log_surgeon/finite_automata/DfaState.hpp +++ b/src/log_surgeon/finite_automata/DfaState.hpp @@ -16,8 +16,8 @@ namespace log_surgeon::finite_automata { template class DfaState; -using DfaByteState = DfaState; -using DfaUtf8State = DfaState; +using ByteDfaState = DfaState; +using Utf8DfaState = DfaState; template class DfaState { diff --git a/src/log_surgeon/finite_automata/NfaState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp index 5c10384..339f38f 100644 --- a/src/log_surgeon/finite_automata/NfaState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -20,8 +20,8 @@ namespace log_surgeon::finite_automata { template class NfaState; -using NfaByteState = NfaState; -using NfaUtf8State = NfaState; +using ByteNfaState = NfaState; +using Utf8NfaState = NfaState; template class NfaState { @@ -137,7 +137,7 @@ auto NfaState::add_interval(Interval interval, NfaState* dest_state) uint32_t overlap_low = std::max(data.m_interval.first, interval.first); uint32_t overlap_high = std::min(data.m_interval.second, interval.second); - std::vector tree_states = data.m_value; + std::vector tree_states = data.m_value; tree_states.push_back(dest_state); m_tree_transitions.insert(Interval(overlap_low, overlap_high), tree_states); if (data.m_interval.first < interval.first) { From ff2dac37420f774ec833eb9a593414c0c99f69cc Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 07:07:37 -0500 Subject: [PATCH 21/30] Remove RegexDFAStateType.hpp. --- .../finite_automata/RegexDFAStateType.hpp | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 src/log_surgeon/finite_automata/RegexDFAStateType.hpp diff --git a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp b/src/log_surgeon/finite_automata/RegexDFAStateType.hpp deleted file mode 100644 index 8cb5ce0..0000000 --- a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE - -#include - -namespace log_surgeon::finite_automata { -enum class RegexTypedDfaState : uint8_t { - Byte, - UTF8 -}; -} // namespace log_surgeon::finite_automata - -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE From 7a8982d2321c476879d8642da55510bfe374c7ff Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 07:09:38 -0500 Subject: [PATCH 22/30] Linter. --- src/log_surgeon/Lexer.tpp | 5 ++--- src/log_surgeon/finite_automata/Dfa.hpp | 3 ++- src/log_surgeon/finite_automata/RegexAST.hpp | 6 ++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index f3f3ef6..bcd3f9a 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -457,9 +457,8 @@ auto Lexer::nfa_to_dfa(finite_automata::Nfaget_tree_transitions().all()) { - for (TypedNfaState* const s1 : data.m_value) { - StateSet closure = epsilon_closure(s1); + for (const typename TypedNfaState::Tree::Data& data : s0->get_tree_transitions().all()) + { for (TypedNfaState* const s1 : data.m_value) { StateSet closure = epsilon_closure(s1); transitions_map[data.m_interval].insert(closure.begin(), closure.end()); } } diff --git a/src/log_surgeon/finite_automata/Dfa.hpp b/src/log_surgeon/finite_automata/Dfa.hpp index 8c7e5f9..531fb58 100644 --- a/src/log_surgeon/finite_automata/Dfa.hpp +++ b/src/log_surgeon/finite_automata/Dfa.hpp @@ -38,7 +38,8 @@ class Dfa { template template -auto Dfa::new_state(std::set const& nfa_state_set) -> TypedDfaState* { +auto Dfa::new_state(std::set const& nfa_state_set +) -> TypedDfaState* { m_states.emplace_back(std::make_unique()); auto* dfa_state = m_states.back().get(); for (auto const* nfa_state : nfa_state_set) { diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 010d533..7ff88bd 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -781,7 +781,8 @@ RegexASTOr::RegexASTOr( } template -void RegexASTOr::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { +void RegexASTOr::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) + const { m_left->add_to_nfa_with_negative_tags(nfa, end_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); } @@ -808,7 +809,8 @@ RegexASTCat::RegexASTCat( } template -void RegexASTCat::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { +void RegexASTCat::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) + const { TypedNfaState* saved_root = nfa->get_root(); TypedNfaState* intermediate_state = nfa->new_state(); m_left->add_to_nfa_with_negative_tags(nfa, intermediate_state); From 7b0a86c298117ddef1ccf39a67547b3e5951f6d2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 07:11:46 -0500 Subject: [PATCH 23/30] Linter again. --- src/log_surgeon/Lexer.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index fc408f2..a392502 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -46,8 +46,10 @@ class Lexer { * @param id * @param regex */ - auto add_rule(uint32_t const& id, std::unique_ptr> rule) - -> void; + auto add_rule( + uint32_t const& id, + std::unique_ptr> rule + ) -> void; /** * Return regex pattern for a rule name From 26326167cc95227138d7a07367c546f2bfc8d92d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 07:48:17 -0500 Subject: [PATCH 24/30] Add missing alogrithm header; Update test-NFA. --- .../finite_automata/TaggedTransition.hpp | 1 + tests/test-NFA.cpp | 18 +++++++++--------- tests/test-lexer.cpp | 12 ++++++------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 7c7b492..43315b2 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -1,6 +1,7 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION #define LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION +#include #include #include #include diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 834e7fe..160d421 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -13,22 +13,22 @@ #include using log_surgeon::cSizeOfByte; -using log_surgeon::finite_automata::NfaByteState; +using log_surgeon::finite_automata::ByteNfaState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using std::string; using std::stringstream; using std::vector; -using ByteLexicalRule = log_surgeon::LexicalRule; -using ByteNFA = log_surgeon::finite_automata::Nfa; -using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat; -using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture; -using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup; -using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral; +using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteNFA = log_surgeon::finite_automata::Nfa; +using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat; +using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture; +using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup; +using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte - = log_surgeon::finite_automata::RegexASTMultiplication; -using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTMultiplication; +using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; TEST_CASE("Test NFA", "[NFA]") { Schema schema; diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index a3ab69f..48b2185 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -19,17 +19,17 @@ using std::vector; using std::wstring_convert; using RegexASTCatByte - = log_surgeon::finite_automata::RegexASTCat; + = log_surgeon::finite_automata::RegexASTCat; using RegexASTCaptureByte - = log_surgeon::finite_automata::RegexASTCapture; + = log_surgeon::finite_automata::RegexASTCapture; using RegexASTGroupByte - = log_surgeon::finite_automata::RegexASTGroup; + = log_surgeon::finite_automata::RegexASTGroup; using RegexASTLiteralByte - = log_surgeon::finite_automata::RegexASTLiteral; + = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< - log_surgeon::finite_automata::NfaByteState>; + log_surgeon::finite_automata::ByteNfaState>; using RegexASTOrByte - = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTOr; using log_surgeon::SchemaVarAST; namespace { From 992a2ec8a4ed054cc5cf31a2e25fe19a26c601ec Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 07:58:51 -0500 Subject: [PATCH 25/30] Update intersect-test.cpp with new names. --- examples/intersect-test.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index abf0b46..f218bd3 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -6,9 +6,9 @@ #include using log_surgeon::finite_automata::Dfa; -using log_surgeon::finite_automata::DfaByteState; +using log_surgeon::finite_automata::ByteDfaState; using log_surgeon::finite_automata::Nfa; -using log_surgeon::finite_automata::NfaByteState; +using log_surgeon::finite_automata::ByteNfaState; using log_surgeon::lexers::ByteLexer; using log_surgeon::LexicalRule; using log_surgeon::ParserAST; @@ -17,11 +17,11 @@ using std::string; using std::unique_ptr; using std::vector; -using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteLexicalRule = log_surgeon::LexicalRule; auto get_intersect_for_query( std::map& m_id_symbol, - std::unique_ptr>& dfa1, + std::unique_ptr>& dfa1, std::string const& search_string ) -> void { std::string processed_search_string; @@ -40,7 +40,7 @@ auto get_intersect_for_query( auto* schema_var_ast = dynamic_cast(parser_ast.get()); rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr)); } - Nfa nfa(std::move(rules)); + Nfa nfa(std::move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); auto schema_types = dfa1->get_intersect(dfa2.get()); std::cout << search_string << ":"; @@ -78,7 +78,7 @@ auto main() -> int { rules.emplace_back(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); m_id_symbol[m_id_symbol.size()] = var_ast->m_name; } - Nfa nfa(std::move(rules)); + Nfa nfa(std::move(rules)); auto dfa = ByteLexer::nfa_to_dfa(nfa); get_intersect_for_query(m_id_symbol, dfa, "*1*"); get_intersect_for_query(m_id_symbol, dfa, "*a*"); From a5419f048f8605b517fa7298d50fe563bbf69dad Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 08:00:56 -0500 Subject: [PATCH 26/30] Linter on intersect-test.cpp. --- examples/intersect-test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index f218bd3..4c3bff8 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -5,10 +5,10 @@ #include #include -using log_surgeon::finite_automata::Dfa; using log_surgeon::finite_automata::ByteDfaState; -using log_surgeon::finite_automata::Nfa; using log_surgeon::finite_automata::ByteNfaState; +using log_surgeon::finite_automata::Dfa; +using log_surgeon::finite_automata::Nfa; using log_surgeon::lexers::ByteLexer; using log_surgeon::LexicalRule; using log_surgeon::ParserAST; From 9153a7cace247ac41df235f4e2a6804c19f62302 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 9 Dec 2024 10:56:50 -0500 Subject: [PATCH 27/30] Switch this->m_lexer to m_lexer by using Parser::m_lexer. --- src/log_surgeon/Lalr1Parser.hpp | 2 ++ src/log_surgeon/Lalr1Parser.tpp | 46 ++++++++++++++++----------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/log_surgeon/Lalr1Parser.hpp b/src/log_surgeon/Lalr1Parser.hpp index f3c6ee6..1580168 100644 --- a/src/log_surgeon/Lalr1Parser.hpp +++ b/src/log_surgeon/Lalr1Parser.hpp @@ -394,6 +394,8 @@ class Lalr1Parser : public Parser { auto symbol_is_token(uint32_t s) -> bool { return m_terminals.find(s) != m_terminals.end(); } + using Parser::m_lexer; + std::set m_terminals; std::set m_nullable; std::map, std::unique_ptr> m_lr0_item_sets; diff --git a/src/log_surgeon/Lalr1Parser.tpp b/src/log_surgeon/Lalr1Parser.tpp index dd00d17..0def510 100644 --- a/src/log_surgeon/Lalr1Parser.tpp +++ b/src/log_surgeon/Lalr1Parser.tpp @@ -71,7 +71,7 @@ void Lalr1Parser::add_rule( std::unique_ptr> rule ) { Parser::add_rule(name, std::move(rule)); - m_terminals.insert(this->m_lexer.m_symbol_id[name]); + m_terminals.insert(m_lexer.m_symbol_id[name]); } template @@ -115,9 +115,9 @@ auto Lalr1Parser::add_production( std::vector const& body, SemanticRule semantic_rule ) -> uint32_t { - if (this->m_lexer.m_symbol_id.find(head) == this->m_lexer.m_symbol_id.end()) { - this->m_lexer.m_symbol_id[head] = this->m_lexer.m_symbol_id.size(); - this->m_lexer.m_id_symbol[this->m_lexer.m_symbol_id[head]] = head; + if (m_lexer.m_symbol_id.find(head) == m_lexer.m_symbol_id.end()) { + m_lexer.m_symbol_id[head] = m_lexer.m_symbol_id.size(); + m_lexer.m_id_symbol[m_lexer.m_symbol_id[head]] = head; } uint32_t n = m_productions.size(); auto it = m_productions_map.find(head); @@ -131,13 +131,13 @@ auto Lalr1Parser::add_production( } std::unique_ptr p(new Production); p->m_index = n; - p->m_head = this->m_lexer.m_symbol_id[head]; + p->m_head = m_lexer.m_symbol_id[head]; for (std::string const& symbol_string : body) { - if (this->m_lexer.m_symbol_id.find(symbol_string) == this->m_lexer.m_symbol_id.end()) { - this->m_lexer.m_symbol_id[symbol_string] = this->m_lexer.m_symbol_id.size(); - this->m_lexer.m_id_symbol[this->m_lexer.m_symbol_id[symbol_string]] = symbol_string; + if (m_lexer.m_symbol_id.find(symbol_string) == m_lexer.m_symbol_id.end()) { + m_lexer.m_symbol_id[symbol_string] = m_lexer.m_symbol_id.size(); + m_lexer.m_id_symbol[m_lexer.m_symbol_id[symbol_string]] = symbol_string; } - p->m_body.push_back(this->m_lexer.m_symbol_id[symbol_string]); + p->m_body.push_back(m_lexer.m_symbol_id[symbol_string]); } p->m_semantic_rule = std::move(semantic_rule); m_non_terminals.insert(std::pair>(p->m_head, {})); @@ -152,7 +152,7 @@ auto Lalr1Parser::add_production( template void Lalr1Parser::generate() { - this->m_lexer.generate(); + m_lexer.generate(); assert(!m_productions.empty()); generate_lr0_kernels(); generate_first_sets(); @@ -435,7 +435,7 @@ void Lalr1Parser::generate_lalr1_action() { for (std::map, std::unique_ptr>::value_type const& kv : m_lr1_item_sets) { ItemSet* item_set_ptr = kv.second.get(); - item_set_ptr->m_actions.resize(this->m_lexer.m_symbol_id.size(), false); + item_set_ptr->m_actions.resize(m_lexer.m_symbol_id.size(), false); for (Item const& item : item_set_ptr->m_closure) { if (!item.has_dot_at_end()) { if (m_terminals.find(item.next_symbol()) == m_terminals.end() @@ -453,7 +453,7 @@ void Lalr1Parser::generate_lalr1_action() { } std::string conflict_msg{}; conflict_msg += "For symbol "; - conflict_msg += this->m_lexer.m_id_symbol[item.next_symbol()]; + conflict_msg += m_lexer.m_id_symbol[item.next_symbol()]; conflict_msg += ", adding shift to "; conflict_msg += std::to_string(item_set_ptr->m_next[item.next_symbol()]->m_index); @@ -465,10 +465,10 @@ void Lalr1Parser::generate_lalr1_action() { } else { conflict_msg += "shift-reduce conflict with reduction "; conflict_msg - += this->m_lexer.m_id_symbol[std::get(action)->m_head]; + += m_lexer.m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { - conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; + conflict_msg += m_lexer.m_id_symbol[symbol] + ","; } conflict_msg += "}\n"; } @@ -486,12 +486,12 @@ void Lalr1Parser::generate_lalr1_action() { if (!std::holds_alternative(action)) { std::string conflict_msg{}; conflict_msg += "For symbol "; - conflict_msg += this->m_lexer.m_id_symbol[item.m_lookahead]; + conflict_msg += m_lexer.m_id_symbol[item.m_lookahead]; conflict_msg += ", adding reduction "; - conflict_msg += this->m_lexer.m_id_symbol[item.m_production->m_head]; + conflict_msg += m_lexer.m_id_symbol[item.m_production->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : item.m_production->m_body) { - conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; + conflict_msg += m_lexer.m_id_symbol[symbol] + ","; } conflict_msg += "} causes "; if (std::holds_alternative(action)) { @@ -501,11 +501,11 @@ void Lalr1Parser::generate_lalr1_action() { } else { conflict_msg += "reduce-reduce conflict with reduction "; conflict_msg - += this->m_lexer + += m_lexer .m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { - conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; + conflict_msg += m_lexer.m_id_symbol[symbol] + ","; } conflict_msg += "}\n"; } @@ -605,12 +605,12 @@ auto Lalr1Parser::report_error() -> std::string { error_type += "'"; if (auto* regex_ast_literal = dynamic_cast*>( - this->m_lexer.get_rule(i) + m_lexer.get_rule(i) )) { error_type += unescape(char(regex_ast_literal->get_character())); } else { - error_type += this->m_lexer.m_id_symbol[i]; + error_type += m_lexer.m_id_symbol[i]; } error_type += "',"; } @@ -660,14 +660,14 @@ void Lalr1Parser::reset() { m_parse_stack_matches.pop(); } m_input_buffer.reset(); - this->m_lexer.reset(); + m_lexer.reset(); } template auto Lalr1Parser::get_next_symbol() -> Token { if (m_next_token == std::nullopt) { Token token; - if (ErrorCode error = this->m_lexer.scan(m_input_buffer, token); + if (ErrorCode error = m_lexer.scan(m_input_buffer, token); ErrorCode::Success != error) { throw std::runtime_error("Error scanning in lexer."); From fa0c098d8777c742acbb2be31189f9a79849f6cc Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 9 Dec 2024 10:58:38 -0500 Subject: [PATCH 28/30] Linter. --- src/log_surgeon/Lalr1Parser.tpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/log_surgeon/Lalr1Parser.tpp b/src/log_surgeon/Lalr1Parser.tpp index 0def510..79bfc47 100644 --- a/src/log_surgeon/Lalr1Parser.tpp +++ b/src/log_surgeon/Lalr1Parser.tpp @@ -464,8 +464,7 @@ void Lalr1Parser::generate_lalr1_action() { conflict_msg += "\n"; } else { conflict_msg += "shift-reduce conflict with reduction "; - conflict_msg - += m_lexer.m_id_symbol[std::get(action)->m_head]; + conflict_msg += m_lexer.m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { conflict_msg += m_lexer.m_id_symbol[symbol] + ","; @@ -501,8 +500,7 @@ void Lalr1Parser::generate_lalr1_action() { } else { conflict_msg += "reduce-reduce conflict with reduction "; conflict_msg - += m_lexer - .m_id_symbol[std::get(action)->m_head]; + += m_lexer.m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { conflict_msg += m_lexer.m_id_symbol[symbol] + ","; @@ -667,9 +665,7 @@ template auto Lalr1Parser::get_next_symbol() -> Token { if (m_next_token == std::nullopt) { Token token; - if (ErrorCode error = m_lexer.scan(m_input_buffer, token); - ErrorCode::Success != error) - { + if (ErrorCode error = m_lexer.scan(m_input_buffer, token); ErrorCode::Success != error) { throw std::runtime_error("Error scanning in lexer."); } return token; From 2dd6f45ac9800dde7ef457102eec20e1030ebe11 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 9 Dec 2024 11:39:36 -0500 Subject: [PATCH 29/30] Lint. --- src/log_surgeon/Lalr1Parser.tpp | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/log_surgeon/Lalr1Parser.tpp b/src/log_surgeon/Lalr1Parser.tpp index 65d458a..f7021a2 100644 --- a/src/log_surgeon/Lalr1Parser.tpp +++ b/src/log_surgeon/Lalr1Parser.tpp @@ -92,11 +92,10 @@ auto Lalr1Parser::add_token_chain( = std::make_unique>(chain[0]); auto second_char_rule = std::make_unique>(chain[1]); - auto rule_chain - = std::make_unique>( - std::move(first_char_rule), - std::move(second_char_rule) - ); + auto rule_chain = std::make_unique>( + std::move(first_char_rule), + std::move(second_char_rule) + ); for (uint32_t i = 2; i < chain.size(); i++) { auto next_char = chain[i]; auto next_char_rule @@ -161,7 +160,7 @@ auto Lalr1Parser::generate() -> void { } template -auto Lalr1Parser::generate_lr0_kernels() -> void{ +auto Lalr1Parser::generate_lr0_kernels() -> void { auto* root_production_ptr = m_productions[m_root_production_id].get(); Item root_item(root_production_ptr, 0, cNullSymbol); auto item_set0 = std::make_unique(); @@ -211,7 +210,8 @@ auto Lalr1Parser::lr_closure_helper( } template -auto Lalr1Parser::generate_lr0_closure(ItemSet* item_set_ptr) -> void { +auto Lalr1Parser::generate_lr0_closure(ItemSet* item_set_ptr +) -> void { std::deque q( item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end() @@ -300,8 +300,7 @@ auto Lalr1Parser::generate_first_sets() -> void { template auto Lalr1Parser::generate_lr1_item_sets() -> void { - for (auto const& kv : m_lr0_item_sets) - { + for (auto const& kv : m_lr0_item_sets) { for (auto const& l0_item : kv.second->m_kernel) { ItemSet temp_item_set; temp_item_set.m_kernel.insert(l0_item); @@ -319,8 +318,7 @@ auto Lalr1Parser::generate_lr1_item_sets() -> void } } std::map> lookaheads; - for (auto const& kv : m_lr0_item_sets) - { + for (auto const& kv : m_lr0_item_sets) { for (auto const& l0_item : kv.second->m_kernel) { lookaheads[l0_item].insert( m_spontaneous_map[l0_item.m_production].begin(), @@ -347,8 +345,7 @@ auto Lalr1Parser::generate_lr1_item_sets() -> void } } } - for (auto const& kv : m_lr0_item_sets) - { + for (auto const& kv : m_lr0_item_sets) { auto lr1_item_set_ptr = std::make_unique(); for (auto const& l0_item : kv.second->m_kernel) { for (auto const& lookahead : lookaheads[l0_item]) { @@ -379,7 +376,8 @@ auto Lalr1Parser::generate_lr1_item_sets() -> void } template -auto Lalr1Parser::generate_lr1_closure(ItemSet* item_set_ptr) -> void { +auto Lalr1Parser::generate_lr1_closure(ItemSet* item_set_ptr +) -> void { std::deque queue(item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end()); while (!queue.empty()) { auto item = queue.back(); @@ -612,8 +610,8 @@ auto Lalr1Parser::report_error() -> std::string { error_type += " before '" + unescape(token.to_string()[0]) + "' token"; } auto error_string = "Schema:" + std::to_string(line_num + 1) + ":" - + std::to_string(consumed_input.size() + 1) - + ": error: " + error_type + "\n"; + + std::to_string(consumed_input.size() + 1) + ": error: " + error_type + + "\n"; for (int i = 0; i < 10; i++) { error_string += " "; } @@ -735,8 +733,7 @@ auto Lalr1Parser::parse_symbol( = reduce->m_semantic_rule(&matched_non_terminal); } auto* curr = m_parse_stack_states.top(); - auto const& it - = curr->m_actions[matched_non_terminal.m_production->m_head]; + auto const& it = curr->m_actions[matched_non_terminal.m_production->m_head]; m_parse_stack_states.push(std::get(it)); m_parse_stack_matches.emplace(std::move(matched_non_terminal)); ret = true; From ee46719d4a7f3bb8302c1b7142d02948250bd5ad Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 9 Dec 2024 17:18:54 -0500 Subject: [PATCH 30/30] More auto changes. --- src/log_surgeon/Lexer.tpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 84d77da..c6bfdba 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -173,7 +173,7 @@ auto Lexer::scan_with_wildcard( char wildcard, Token& token ) -> ErrorCode { - TypedDfaState const* state = m_dfa->get_root(); + auto const* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; m_asked_for_more_data = false; @@ -197,7 +197,7 @@ auto Lexer::scan_with_wildcard( m_type_ids = nullptr; } while (true) { - uint32_t prev_byte_buf_pos = input_buffer.storage().pos(); + auto prev_byte_buf_pos = input_buffer.storage().pos(); unsigned char next_char{utf8::cCharErr}; if (ErrorCode err = input_buffer.get_next_character(next_char); ErrorCode::Success != err) { m_asked_for_more_data = true; @@ -240,7 +240,7 @@ auto Lexer::scan_with_wildcard( // BFS (keep track of m_type_ids) if (wildcard == '?') { for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - TypedDfaState* next_state = state->next(byte); + auto* next_state = state->next(byte); if (next_state->is_accepting() == false) { token = Token{m_last_match_pos, @@ -304,7 +304,7 @@ template auto Lexer::increase_buffer_capacity(ParserInputBuffer& input_buffer ) -> void { uint32_t old_storage_size{0}; - bool flipped_static_buffer{false}; + auto flipped_static_buffer{false}; input_buffer.increase_capacity(old_storage_size, flipped_static_buffer); if (old_storage_size < input_buffer.storage().size()) { if (flipped_static_buffer) { @@ -346,10 +346,10 @@ template void Lexer::add_delimiters(std::vector const& delimiters) { assert(!delimiters.empty()); m_has_delimiters = true; - for (bool& i : m_is_delimiter) { + for (auto& i : m_is_delimiter) { i = false; } - for (uint32_t delimiter : delimiters) { + for (auto delimiter : delimiters) { m_is_delimiter[delimiter] = true; } m_is_delimiter[utf8::cCharStartOfFile] = true; @@ -379,7 +379,7 @@ void Lexer::generate() { finite_automata::Nfa nfa{std::move(m_rules)}; // TODO: DFA ignores tags. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); - TypedDfaState const* state = m_dfa->get_root(); + auto const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { if (state->next(i) != nullptr) { m_is_first_char[i] = true; @@ -430,23 +430,22 @@ template auto Lexer::nfa_to_dfa(finite_automata::Nfa& nfa ) -> std::unique_ptr> { typedef std::set StateSet; - std::unique_ptr> dfa - = std::make_unique>(); + auto dfa = std::make_unique>(); std::map dfa_states; std::stack unmarked_sets; auto create_dfa_state = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> TypedDfaState* { - TypedDfaState* state = dfa->new_state(set); + auto* state = dfa->new_state(set); dfa_states[set] = state; unmarked_sets.push(set); return state; }; - StateSet start_set = epsilon_closure(nfa.get_root()); + auto start_set = epsilon_closure(nfa.get_root()); create_dfa_state(start_set); while (!unmarked_sets.empty()) { - StateSet set = unmarked_sets.top(); + auto set = unmarked_sets.top(); unmarked_sets.pop(); - TypedDfaState* dfa_state = dfa_states.at(set); + auto* dfa_state = dfa_states.at(set); std::map ascii_transitions_map; // map transitions_map; for (TypedNfaState const* s0 : set) { @@ -477,7 +476,7 @@ auto Lexer::nfa_to_dfa(finite_automata::Nfa::value_type const& kv : ascii_transitions_map) { - TypedDfaState* dest_state = next_dfa_state(kv.second); + auto* dest_state = next_dfa_state(kv.second); dfa_state->add_byte_transition(kv.first, dest_state); } // TODO: add this for the utf8 case