diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ad59f75..ccdd5eb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,20 @@ set(SOURCE_FILES src/log_surgeon/Constants.hpp src/log_surgeon/FileReader.cpp src/log_surgeon/FileReader.hpp + src/log_surgeon/finite_automata/Capture.hpp + src/log_surgeon/finite_automata/Dfa.hpp + src/log_surgeon/finite_automata/DfaState.hpp + src/log_surgeon/finite_automata/DfaStatePair.hpp + src/log_surgeon/finite_automata/Nfa.hpp + src/log_surgeon/finite_automata/NfaState.hpp + src/log_surgeon/finite_automata/PrefixTree.cpp + src/log_surgeon/finite_automata/PrefixTree.hpp + src/log_surgeon/finite_automata/RegexAST.hpp + src/log_surgeon/finite_automata/RegisterHandler.hpp + src/log_surgeon/finite_automata/StateType.hpp + src/log_surgeon/finite_automata/TaggedTransition.hpp + src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp + src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp src/log_surgeon/Lalr1Parser.cpp src/log_surgeon/Lalr1Parser.hpp src/log_surgeon/Lalr1Parser.tpp @@ -93,20 +107,7 @@ set(SOURCE_FILES src/log_surgeon/SchemaParser.hpp src/log_surgeon/Token.cpp src/log_surgeon/Token.hpp - src/log_surgeon/finite_automata/PrefixTree.cpp - src/log_surgeon/finite_automata/PrefixTree.hpp - src/log_surgeon/finite_automata/RegexAST.hpp - src/log_surgeon/finite_automata/Dfa.hpp - src/log_surgeon/finite_automata/DfaState.hpp - src/log_surgeon/finite_automata/DfaStatePair.hpp - src/log_surgeon/finite_automata/Nfa.hpp - src/log_surgeon/finite_automata/NfaState.hpp - src/log_surgeon/finite_automata/RegisterHandler.hpp - src/log_surgeon/finite_automata/StateType.hpp - src/log_surgeon/finite_automata/Tag.hpp - src/log_surgeon/finite_automata/TaggedTransition.hpp - src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp - src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp + src/log_surgeon/UniqueIdGenerator.hpp ) set(LCHIP_INSTALL_CONFIG_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/log_surgeon) diff --git a/src/log_surgeon/Lexer.hpp 
b/src/log_surgeon/Lexer.hpp index 4f68a168..d99f94f0 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,10 @@ namespace log_surgeon { template class Lexer { public: + using register_id_t = finite_automata::RegisterHandler::register_id_t; + using symbol_id_t = uint32_t; + using tag_id_t = finite_automata::tag_id_t; + static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolId::TokenEnd}; static inline std::vector const cTokenUncaughtStringTypes = {(uint32_t)SymbolId::TokenUncaughtString}; @@ -51,7 +56,8 @@ class Lexer { auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; /** - * Generate DFA for lexer + * Generate DFA for lexer. + * @throw std::invalid_argument if `m_rules` contains multiple captures with the same name. */ auto generate() -> void; @@ -122,8 +128,48 @@ class Lexer { return m_dfa; } - std::unordered_map m_symbol_id; - std::unordered_map m_id_symbol; + [[nodiscard]] auto get_capture_ids_for_var_id(symbol_id_t const var_id + ) const -> std::optional> { + auto const capture_ids{m_var_id_to_capture_ids.find(var_id)}; + if (m_var_id_to_capture_ids.end() == capture_ids) { + return std::nullopt; + } + return capture_ids->second; + } + + [[nodiscard]] auto get_tag_ids_for_capture_id(symbol_id_t const capture_id + ) const -> std::optional> { + auto const tag_ids{m_capture_id_to_tag_ids.find(capture_id)}; + if (m_capture_id_to_tag_ids.end() == tag_ids) { + return std::nullopt; + } + return tag_ids->second; + } + + [[nodiscard]] auto get_register_for_tag_id(tag_id_t const tag_id + ) const -> std::optional { + auto const it{m_tag_to_register_id.find(tag_id)}; + if (m_tag_to_register_id.end() == it) { + return std::nullopt; + } + return it->second; + } + + [[nodiscard]] auto get_registers_for_capture(symbol_id_t capture_id + ) const -> std::optional> { + auto const tag_ids{get_tag_ids_for_capture_id(capture_id)}; + if 
(tag_ids.has_value()) { + auto const start_reg{get_register_for_tag_id(tag_ids.value().first)}; + auto const end_reg{get_register_for_tag_id(tag_ids.value().second)}; + if (start_reg.has_value() && end_reg.has_value()) { + return std::make_pair(start_reg.value(), end_reg.value()); + } + } + return std::nullopt; + } + + std::unordered_map m_symbol_id; + std::unordered_map m_id_symbol; private: /** @@ -148,6 +194,9 @@ class Lexer { std::unique_ptr> m_dfa; bool m_asked_for_more_data{false}; TypedDfaState const* m_prev_state{nullptr}; + std::unordered_map> m_var_id_to_capture_ids; + std::unordered_map> m_capture_id_to_tag_ids; + std::unordered_map m_tag_to_register_id; }; namespace lexers { diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index a4e36f55..2e91575c 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -358,17 +359,17 @@ void Lexer::add_delimiters(std::vector c template void Lexer::add_rule( - uint32_t const& id, + symbol_id_t const& var_id, std::unique_ptr> rule ) { - m_rules.emplace_back(id, std::move(rule)); + m_rules.emplace_back(var_id, std::move(rule)); } template -auto Lexer::get_rule(uint32_t const variable_id +auto Lexer::get_rule(symbol_id_t const var_id ) -> finite_automata::RegexAST* { for (auto const& rule : m_rules) { - if (rule.get_variable_id() == variable_id) { + if (rule.get_variable_id() == var_id) { return rule.get_regex(); } } @@ -377,8 +378,30 @@ auto Lexer::get_rule(uint32_t const variable_id template void Lexer::generate() { - finite_automata::Nfa nfa{std::move(m_rules)}; - // TODO: DFA ignores tags. 
E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" + for (auto const& rule : m_rules) { + for (auto* capture : rule.get_captures()) { + std::string const capture_name{capture->get_name()}; + symbol_id_t capture_id{0}; + if (m_symbol_id.find(capture_name) == m_symbol_id.end()) { + capture_id = m_symbol_id.size(); + m_symbol_id[capture_name] = capture_id; + m_id_symbol[capture_id] = capture_name; + } else { + throw std::invalid_argument("`m_rules` contains capture names that are not unique." + ); + } + m_var_id_to_capture_ids[rule.get_variable_id()].push_back(capture_id); + } + } + + finite_automata::Nfa nfa{m_rules}; + for (auto const& [capture, tag_ids] : nfa.get_capture_to_tag_ids()) { + std::string capture_name{capture->get_name()}; + auto capture_id{m_symbol_id[capture_name]}; + m_capture_id_to_tag_ids.emplace(capture_id, tag_ids); + } + + // TODO: DFA ignores captures. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = std::make_unique>(std::move(nfa)); auto const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp index 6ab7e861..c81456be 100644 --- a/src/log_surgeon/LexicalRule.hpp +++ b/src/log_surgeon/LexicalRule.hpp @@ -23,6 +23,10 @@ class LexicalRule { */ auto add_to_nfa(finite_automata::Nfa* nfa) const -> void; + [[nodiscard]] auto get_captures() const -> std::vector { + return m_regex->get_subtree_positive_captures(); + } + [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index d36271ca..1960e997 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -9,8 +9,8 @@ #include #include +#include #include -#include #include #include #include @@ -167,7 +167,7 @@ static auto regex_capture_rule(NonTerminal const* m) -> 
std::unique_ptrnon_terminal_cast(5)->get_parser_ast()->get>(); return std::make_unique(make_unique( std::move(r6), - std::make_unique(r4->m_name) + std::make_unique(r4->m_name) )); } @@ -202,7 +202,7 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr { static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); - // To handle negative tags we treat `R*` as `R+ | ∅`. + // To handle negative captures we treat `R*` as `R+ | ∅`. return make_unique(make_unique( make_unique(), make_unique(std::move(r1), 1, 0) @@ -248,7 +248,7 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); if (0 == min) { - // To handle negative tags we treat `R*` as `R+ | ∅`. + // To handle negative captures we treat `R*` as `R+ | ∅`. return make_unique(make_unique( make_unique(), make_unique(std::move(r1), 1, max) diff --git a/src/log_surgeon/UniqueIdGenerator.hpp b/src/log_surgeon/UniqueIdGenerator.hpp new file mode 100644 index 00000000..47ab1490 --- /dev/null +++ b/src/log_surgeon/UniqueIdGenerator.hpp @@ -0,0 +1,16 @@ +#ifndef LOG_SURGEON_UNIQUEIDGENERATOR_HPP +#define LOG_SURGEON_UNIQUEIDGENERATOR_HPP + +namespace log_surgeon { +class UniqueIdGenerator { +public: + UniqueIdGenerator() : current_id{0} {} + + [[nodiscard]] auto generate_id() -> uint32_t { return current_id++; } + +private: + uint32_t current_id; +}; +} // namespace log_surgeon + +#endif // LOG_SURGEON_UNIQUEIDGENERATOR_HPP diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Capture.hpp similarity index 55% rename from src/log_surgeon/finite_automata/Tag.hpp rename to src/log_surgeon/finite_automata/Capture.hpp index 3a3b4d7f..84480eab 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Capture.hpp @@ -1,14 +1,14 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_TAG -#define LOG_SURGEON_FINITE_AUTOMATA_TAG 
+#ifndef LOG_SURGEON_FINITE_AUTOMATA_CAPTURE +#define LOG_SURGEON_FINITE_AUTOMATA_CAPTURE #include #include #include namespace log_surgeon::finite_automata { -class Tag { +class Capture { public: - explicit Tag(std::string name) : m_name{std::move(name)} {} + explicit Capture(std::string name) : m_name{std::move(name)} {} [[nodiscard]] auto get_name() const -> std::string_view { return m_name; } @@ -17,4 +17,4 @@ class Tag { }; } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_TAG +#endif // LOG_SURGEON_FINITE_AUTOMATA_CAPTURE diff --git a/src/log_surgeon/finite_automata/Dfa.hpp b/src/log_surgeon/finite_automata/Dfa.hpp index baceaec3..ecafa7e8 100644 --- a/src/log_surgeon/finite_automata/Dfa.hpp +++ b/src/log_surgeon/finite_automata/Dfa.hpp @@ -2,12 +2,16 @@ #define LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP #include +#include #include #include +#include #include +#include #include #include +#include namespace log_surgeon::finite_automata { template @@ -38,6 +42,7 @@ class Dfa { private: std::vector> m_states; + RegisterHandler m_register_handler; }; template @@ -74,7 +79,7 @@ Dfa::Dfa(Nfa nfa) { } auto next_dfa_state = [&dfa_states, &create_dfa_state](StateSet const& set) -> TypedDfaState* { - TypedDfaState* state; + TypedDfaState* state{nullptr}; auto it = dfa_states.find(set); if (it == dfa_states.end()) { state = create_dfa_state(set); diff --git a/src/log_surgeon/finite_automata/Nfa.hpp b/src/log_surgeon/finite_automata/Nfa.hpp index 8eaaaadd..30d0266c 100644 --- a/src/log_surgeon/finite_automata/Nfa.hpp +++ b/src/log_surgeon/finite_automata/Nfa.hpp @@ -15,14 +15,23 @@ #include #include #include +#include namespace log_surgeon::finite_automata { +/** + * Represents an NFA (non-deterministic finite automaton) for recognizing a language based on the set + * of rules used during initialization. Currently used as an intermediate model for generating the + * DFA. + * + * Currently we assume all capture groups have unique names. 
+ * @tparam TypedNfaState + */ template class Nfa { public: using StateVec = std::vector; - explicit Nfa(std::vector> rules); + explicit Nfa(std::vector> const& rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. @@ -30,40 +39,28 @@ class Nfa { */ [[nodiscard]] auto new_state() -> TypedNfaState*; - /** - * Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to - * `m_states`. - * @param tag - * @param dest_state - * @return A new state with a positive tagged end transition to `dest_state`. - */ - [[nodiscard]] auto new_state_with_positive_tagged_end_transition( - Tag const* tag, - TypedNfaState const* dest_state - ) -> TypedNfaState*; - /** * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. - * @param tags + * @param captures * @param dest_state * @return TypedNfaState* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( - std::vector tags, + std::vector const& captures, TypedNfaState const* dest_state ) -> TypedNfaState*; /** * Creates the start and end states for a capture group. - * @param tag The tag associated with the capture group. + * @param capture The capture associated with the capture group. * @param dest_state * @return A pair of states: * - A new state with a positive tagged start transition from `m_root`. * - A new state with a positive tagged end transition to `dest_state`. */ [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( - Tag const* tag, + Capture const* capture, TypedNfaState const* dest_state ) -> std::pair; @@ -86,23 +83,61 @@ class Nfa { auto get_root() -> TypedNfaState* { return m_root; } + [[nodiscard]] auto get_capture_to_tag_ids( + ) const -> std::unordered_map> { + return m_capture_to_tag_ids; + } + private: + /** + * Creates start and end tags for the specified capture if they don't currently exist. 
+ * @param capture + * @return The start and end tags corresponding to `capture`. + */ + auto get_or_create_capture_tags(Capture const* capture) -> std::pair; + + /** + * Creates a `unique_ptr` for an NFA state with a positive tagged end transition and adds it to + * `m_states`. + * @param tag_id + * @param dest_state + * @return A new state with a positive tagged end transition to `dest_state`. + */ + [[nodiscard]] auto new_state_with_positive_tagged_end_transition( + tag_id_t tag_id, + TypedNfaState const* dest_state + ) -> TypedNfaState*; + std::vector> m_states; + // TODO: Lexer currently enforces unique naming across capture groups. However, this limits use + // cases. Possibly initialize this in the lexer and pass it in during construction. + std::unordered_map> m_capture_to_tag_ids; TypedNfaState* m_root; - // Store the rules locally as they contain information needed by the NFA. E.g., transitions in - // the NFA point to tags in the rule ASTs. - std::vector> m_rules; + UniqueIdGenerator m_unique_id_generator; }; template -Nfa::Nfa(std::vector> rules) - : m_root{new_state()}, - m_rules{std::move(rules)} { - for (auto const& rule : m_rules) { +Nfa::Nfa(std::vector> const& rules) + : m_root{new_state()} { + for (auto const& rule : rules) { rule.add_to_nfa(this); } } +template +auto Nfa::get_or_create_capture_tags(Capture const* capture +) -> std::pair { + auto const existing_tags{m_capture_to_tag_ids.find(capture)}; + if (m_capture_to_tag_ids.end() == existing_tags) { + auto start_tag{m_unique_id_generator.generate_id()}; + auto end_tag{m_unique_id_generator.generate_id()}; + auto new_tags{std::make_pair(start_tag, end_tag)}; + m_capture_to_tag_ids.emplace(capture, new_tags); + return new_tags; + } + return existing_tags->second; +} + template auto Nfa::new_state() -> TypedNfaState* { m_states.emplace_back(std::make_unique()); @@ -111,31 +146,38 @@ auto Nfa::new_state() -> TypedNfaState* { template auto Nfa::new_state_with_positive_tagged_end_transition( - Tag 
const* tag, + tag_id_t const tag_id, TypedNfaState const* dest_state ) -> TypedNfaState* { - m_states.emplace_back(std::make_unique(tag, dest_state)); + m_states.emplace_back(std::make_unique(tag_id, dest_state)); return m_states.back().get(); } template auto Nfa::new_state_with_negative_tagged_transition( - std::vector tags, + std::vector const& captures, TypedNfaState const* dest_state ) -> TypedNfaState* { + std::vector tags; + for (auto const capture : captures) { + auto [start_tag, end_tag]{get_or_create_capture_tags(capture)}; + tags.push_back(start_tag); + tags.push_back(end_tag); + } + m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); } template auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( - Tag const* tag, + Capture const* capture, TypedNfaState const* dest_state ) -> std::pair { + auto [start_tag, end_tag]{get_or_create_capture_tags(capture)}; auto* start_state = new_state(); - m_root->add_positive_tagged_start_transition(tag, start_state); - - auto* end_state = new_state_with_positive_tagged_end_transition(tag, dest_state); + m_root->add_positive_tagged_start_transition(start_tag, start_state); + auto* end_state{new_state_with_positive_tagged_end_transition(end_tag, dest_state)}; return {start_state, end_state}; } diff --git a/src/log_surgeon/finite_automata/NfaState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp index 590c1607..a3b46ba0 100644 --- a/src/log_surgeon/finite_automata/NfaState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -18,6 +18,7 @@ #include namespace log_surgeon::finite_automata { + template class NfaState; @@ -31,11 +32,12 @@ class NfaState { NfaState() = default; - NfaState(Tag const* tag, NfaState const* dest_state) - : m_positive_tagged_end_transition{PositiveTaggedTransition{tag, dest_state}} {} + NfaState(tag_id_t tag_id, NfaState const* dest_state) + : m_positive_tagged_end_transition{PositiveTaggedTransition{tag_id, dest_state}} {} - 
NfaState(std::vector tags, NfaState const* dest_state) - : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} + NfaState(std::vector tag_ids, NfaState const* dest_state) + : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tag_ids), dest_state} + } {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -49,8 +51,9 @@ class NfaState { return m_matching_variable_id; } - auto add_positive_tagged_start_transition(Tag const* tag, NfaState const* dest_state) -> void { - m_positive_tagged_start_transitions.emplace_back(tag, dest_state); + auto add_positive_tagged_start_transition(tag_id_t const tag_id, NfaState const* dest_state) + -> void { + m_positive_tagged_start_transitions.emplace_back(tag_id, dest_state); } [[nodiscard]] auto get_positive_tagged_start_transitions( diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index e2de78aa..fc047d2a 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -12,11 +12,12 @@ namespace log_surgeon::finite_automata { * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree * stores a single position in the lexed string. Each path from the root to an index corresponds to * a sequence of positions for an individual tag: - * - Positive position node: Indicates the tag was matched at the position. - * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, - * it indicates the tag was never matched. If the negative tag is along a path containing positive - * nodes, it functions as a placeholder. This can be useful for nested capture groups, to maintain - * a one-to-one mapping between the contained capture group and the enclosing capture group. + * - Positive position node: Indicates the capture was matched at the position. 
+ * - Negative position node: Indicates the capture was unmatched. If a negative node is the entire + * path, it indicates the tag was never matched. If the negative tag is along a path containing + * positive nodes, it functions as a placeholder. This can be useful for nested capture groups, to + * maintain a one-to-one mapping between the contained capture group and the enclosing capture + * group. */ class PrefixTree { public: diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index bb55f62d..eada30b1 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include namespace log_surgeon::finite_automata { @@ -30,12 +30,12 @@ class Nfa; // TODO: rename `RegexAST` to `RegexASTNode` /** * Base class for a Regex AST node. - * Unique integer tags are used to differentiate each capture group node. Every node will maintain - * two sets of tags: - * 1. `m_subtree_positive_tags`: the set of tags matched by all capture groups within the subtree - * rooted at this node. - * 2. `m_negative_tags`: the set of tags that are guaranteed to be unmatched when traversing this - * node, as the alternative path contains these tags. + * Unique capture pointers are used to differentiate each capture group node. Every node will + * maintain two sets of captures: + * 1. `m_subtree_positive_captures`: the set of captures matched by all capture groups within the + * subtree rooted at this node. + * 2. `m_negative_captures`: the set of captures that are guaranteed to be unmatched when traversing + * this node, as the alternative path contains these captures. * * ASTs built using this class are assumed to be constructed in a bottom-up manner, where all * descendant nodes are created first. 
@@ -83,24 +83,26 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { - return m_subtree_positive_tags; + [[nodiscard]] auto get_subtree_positive_captures() const -> std::vector const& { + return m_subtree_positive_captures; } - auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { - m_subtree_positive_tags = std::move(subtree_positive_tags); + auto set_subtree_positive_captures(std::vector subtree_positive_captures + ) -> void { + m_subtree_positive_captures = std::move(subtree_positive_captures); } - auto add_subtree_positive_tags(std::vector const& subtree_positive_tags) -> void { - m_subtree_positive_tags.insert( - m_subtree_positive_tags.end(), - subtree_positive_tags.cbegin(), - subtree_positive_tags.cend() + auto add_subtree_positive_captures(std::vector const& subtree_positive_captures + ) -> void { + m_subtree_positive_captures.insert( + m_subtree_positive_captures.end(), + subtree_positive_captures.cbegin(), + subtree_positive_captures.cend() ); } - auto set_negative_tags(std::vector negative_tags) -> void { - m_negative_tags = std::move(negative_tags); + auto set_negative_captures(std::vector negative_captures) -> void { + m_negative_captures = std::move(negative_captures); } /** @@ -110,11 +112,13 @@ class RegexAST { */ auto add_to_nfa_with_negative_tags(Nfa* nfa, TypedNfaState* end_state) const -> void { - // Handle negative tags as: - // root --(regex)--> state_with_negative_tagged_transition --(negative tags)--> end_state - if (false == m_negative_tags.empty()) { - auto* state_with_negative_tagged_transition - = nfa->new_state_with_negative_tagged_transition(m_negative_tags, end_state); + // Handle negative captures as: + // root --(regex)--> state_with_negative_tagged_transition --(negative captures)--> + // end_state + if (false == m_negative_captures.empty()) { + auto* state_with_negative_tagged_transition{ + 
nfa->new_state_with_negative_tagged_transition(m_negative_captures, end_state) + }; add_to_nfa(nfa, state_with_negative_tagged_transition); } else { add_to_nfa(nfa, end_state); @@ -127,27 +131,30 @@ class RegexAST { RegexAST(RegexAST&& rhs) noexcept = delete; auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = delete; - [[nodiscard]] auto serialize_negative_tags() const -> std::u32string { - if (m_negative_tags.empty()) { + [[nodiscard]] auto serialize_negative_captures() const -> std::u32string { + if (m_negative_captures.empty()) { return U""; } - auto const transformed_negative_tags - = m_negative_tags | std::ranges::views::transform([](Tag const* tag) { - return fmt::format("<~{}>", tag->get_name()); - }); - auto const negative_tags_string - = fmt::format("{}", fmt::join(transformed_negative_tags, "")); + auto const transformed_negative_captures{ + m_negative_captures | std::ranges::views::transform([](Capture const* capture) { + return fmt::format("<~{}>", capture->get_name()); + }) + }; + auto const negative_captures_string{ + fmt::format("{}", fmt::join(transformed_negative_captures, "")) + }; return fmt::format( U"{}", - std::u32string(negative_tags_string.begin(), negative_tags_string.end()) + std::u32string(negative_captures_string.begin(), negative_captures_string.end()) ); } private: - std::vector m_subtree_positive_tags; - std::vector m_negative_tags; + std::vector m_subtree_capture_ids; + std::vector m_subtree_positive_captures; + std::vector m_negative_captures; }; /** @@ -624,7 +631,7 @@ class RegexASTMultiplication : public RegexAST { /** * Represents a capture group AST node. * NOTE: - * - `m_tag` is always expected to be non-null. + * - `m_capture` is always expected to be non-null. * - `m_group_regex_ast` is always expected to be non-null. * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). 
*/ @@ -635,24 +642,26 @@ class RegexASTCapture : public RegexAST { /** * @param group_regex_ast - * @param tag - * @throw std::invalid_argument if `group_regex_ast` or `tag` are `nullptr`. + * @param capture + * @throw std::invalid_argument if `group_regex_ast` or `capture` are `nullptr`. */ RegexASTCapture( std::unique_ptr> group_regex_ast, - std::unique_ptr tag + std::unique_ptr capture ) : m_group_regex_ast{( nullptr == group_regex_ast ? throw std::invalid_argument("Group regex AST cannot be null") : std::move(group_regex_ast) )}, - m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") - : std::move(tag)} { - RegexAST::set_subtree_positive_tags( - m_group_regex_ast->get_subtree_positive_tags() + m_capture{ + nullptr == capture ? throw std::invalid_argument("Capture cannot be null") + : std::move(capture) + } { + RegexAST::set_subtree_positive_captures( + m_group_regex_ast->get_subtree_positive_captures() ); - RegexAST::add_subtree_positive_tags({m_tag.get()}); + RegexAST::add_subtree_positive_captures({m_capture.get()}); } RegexASTCapture(RegexASTCapture const& rhs) @@ -660,8 +669,8 @@ class RegexASTCapture : public RegexAST { m_group_regex_ast{ std::unique_ptr>(rhs.m_group_regex_ast->clone()) }, - m_tag{std::make_unique(*rhs.m_tag)} { - RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); + m_capture{std::make_unique(*rhs.m_capture)} { + RegexAST::set_subtree_positive_captures(rhs.get_subtree_positive_captures()); } /** @@ -701,7 +710,7 @@ class RegexASTCapture : public RegexAST { [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_group_name() const -> std::string_view { return m_tag->get_name(); } + [[nodiscard]] auto get_group_name() const -> std::string_view { return m_capture->get_name(); } [[nodiscard]] auto get_group_regex_ast( ) const -> std::unique_ptr> const& { @@ -710,12 +719,12 @@ class RegexASTCapture : public RegexAST { private: std::unique_ptr> m_group_regex_ast; - 
std::unique_ptr m_tag; + std::unique_ptr m_capture; }; template [[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { - return fmt::format(U"{}", RegexAST::serialize_negative_tags()); + return fmt::format(U"{}", RegexAST::serialize_negative_captures()); } template @@ -732,7 +741,7 @@ template return fmt::format( U"{}{}", static_cast(m_character), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -763,7 +772,7 @@ template return fmt::format( U"{}{}", std::u32string(digits_string.begin(), digits_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -774,10 +783,11 @@ RegexASTOr::RegexASTOr( ) : m_left(std::move(left)), m_right(std::move(right)) { - m_left->set_negative_tags(m_right->get_subtree_positive_tags()); - m_right->set_negative_tags(m_left->get_subtree_positive_tags()); - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + m_left->set_negative_captures(m_right->get_subtree_positive_captures()); + m_right->set_negative_captures(m_left->get_subtree_positive_captures()); + RegexAST::set_subtree_positive_captures(m_left->get_subtree_positive_captures()); + RegexAST::add_subtree_positive_captures(m_right->get_subtree_positive_captures() + ); } template @@ -793,7 +803,7 @@ template U"({})|({}){}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? 
m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -804,8 +814,9 @@ RegexASTCat::RegexASTCat( ) : m_left(std::move(left)), m_right(std::move(right)) { - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_captures(m_left->get_subtree_positive_captures()); + RegexAST::add_subtree_positive_captures(m_right->get_subtree_positive_captures() + ); } template @@ -825,7 +836,7 @@ template U"{}{}{}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -838,7 +849,8 @@ RegexASTMultiplication::RegexASTMultiplication( : m_operand(std::move(operand)), m_min(min), m_max(max) { - RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_captures(m_operand->get_subtree_positive_captures( + )); } template @@ -887,7 +899,7 @@ template nullptr != m_operand ? m_operand->serialize() : U"null", std::u32string(min_string.begin(), min_string.end()), is_infinite() ? 
U"inf" : std::u32string(max_string.begin(), max_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -900,7 +912,7 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNf // +---------------------+ // | `m_root` | // +---------------------+ - // | `m_tag` start + // | `m_capture` start // | (positive tagged start transition) // v // +---------------------+ @@ -913,13 +925,13 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNf // | `m_group_regex_ast` | // | (nested NFA) | // +---------------------+ - // | `m_negative_tags` + // | `m_negative_captures` // | (negative tagged transition) // v // +---------------------+ // | `capture_end_state` | // +---------------------+ - // | `m_tag` end + // | `m_capture` end // | (positive tagged end transition) // v // +---------------------+ @@ -927,7 +939,7 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNf // +---------------------+ auto [capture_start_state, capture_end_state] = nfa->new_start_and_end_states_with_positive_tagged_transitions( - m_tag.get(), + m_capture.get(), dest_state ); @@ -939,12 +951,14 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNf template [[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { - auto const tag_name_u32 = std::u32string(m_tag->get_name().cbegin(), m_tag->get_name().cend()); + auto const capture_name_u32{ + std::u32string(m_capture->get_name().cbegin(), m_capture->get_name().cend()) + }; return fmt::format( U"({})<{}>{}", m_group_regex_ast->serialize(), - tag_name_u32, - RegexAST::serialize_negative_tags() + capture_name_u32, + RegexAST::serialize_negative_captures() ); } @@ -1100,7 +1114,7 @@ template U"[{}{}]{}", m_negate ? 
U"^" : U"", ranges_serialized, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp index d61240e3..be56cb47 100644 --- a/src/log_surgeon/finite_automata/RegisterHandler.hpp +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -2,6 +2,7 @@ #define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP #include +#include #include #include @@ -17,6 +18,8 @@ namespace log_surgeon::finite_automata { */ class RegisterHandler { public: + using register_id_t = uint32_t; + auto add_register( PrefixTree::id_t const prefix_tree_parent_node_id, PrefixTree::position_t const position diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 43315b2a..e2e44d36 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -10,24 +10,18 @@ #include -#include - namespace log_surgeon::finite_automata { +using tag_id_t = std::uint32_t; + /** - * Represents an NFA transition indicating that a capture group has been matched. - * NOTE: `m_tag` is always expected to be non-null. + * Represents an NFA transition indicating that a tag has been matched. * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ template class PositiveTaggedTransition { public: - /** - * @param tag - * @param dest_state - * @throw std::invalid_argument if `tag` is `nullptr`. - */ - PositiveTaggedTransition(Tag const* tag, TypedNfaState const* dest_state) - : m_tag{nullptr == tag ? 
throw std::invalid_argument("Tag cannot be null") : tag}, + PositiveTaggedTransition(tag_id_t const tag_id, TypedNfaState const* dest_state) + : m_tag_id{tag_id}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } @@ -43,34 +37,23 @@ class PositiveTaggedTransition { if (state_id_it == state_ids.end()) { return std::nullopt; } - return fmt::format("{}[{}]", state_id_it->second, m_tag->get_name()); + return fmt::format("{}[{}]", state_id_it->second, m_tag_id); } private: - Tag const* m_tag; + tag_id_t m_tag_id; TypedNfaState const* m_dest_state; }; /** - * Represents an NFA transition indicating that a capture group has been unmatched. - * NOTE: All tags in `m_tags` are always expected to be non-null. + * Represents an NFA transition indicating that a tag has been unmatched. * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ template class NegativeTaggedTransition { public: - /** - * @param tags - * @param dest_state - * @throw std::invalid_argument if any elements in `tags` is `nullptr`. 
- */ - NegativeTaggedTransition(std::vector tags, TypedNfaState const* dest_state) - : m_tags{[&tags] { - if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { - throw std::invalid_argument("Tags cannot contain null elements"); - } - return std::move(tags); - }()}, + NegativeTaggedTransition(std::vector tag_ids, TypedNfaState const* dest_state) + : m_tag_ids{std::move(tag_ids)}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } @@ -86,14 +69,11 @@ class NegativeTaggedTransition { if (state_id_it == state_ids.end()) { return std::nullopt; } - - auto const tag_names = m_tags | std::ranges::views::transform(&Tag::get_name); - - return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); + return fmt::format("{}[{}]", state_id_it->second, fmt::join(m_tag_ids, ",")); } private: - std::vector m_tags; + std::vector m_tag_ids; TypedNfaState const* m_dest_state; }; } // namespace log_surgeon::finite_automata diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 652ecebc..edf87095 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,14 +2,14 @@ set( SOURCES_LOG_SURGEON ../src/log_surgeon/FileReader.cpp ../src/log_surgeon/FileReader.hpp + ../src/log_surgeon/finite_automata/Capture.hpp + ../src/log_surgeon/finite_automata/Nfa.hpp + ../src/log_surgeon/finite_automata/NfaState.hpp ../src/log_surgeon/finite_automata/PrefixTree.cpp ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp - ../src/log_surgeon/finite_automata/Nfa.hpp - ../src/log_surgeon/finite_automata/NfaState.hpp ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/StateType.hpp - ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/Lalr1Parser.cpp ../src/log_surgeon/Lalr1Parser.hpp @@ -22,9 +22,17 @@ set( 
../src/log_surgeon/SchemaParser.hpp ../src/log_surgeon/Token.cpp ../src/log_surgeon/Token.hpp + ../src/log_surgeon/UniqueIdGenerator.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-nfa.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp) +set( + SOURCES_TESTS + test-lexer.cpp + test-nfa.cpp + test-prefix-tree.cpp + test-register-handler.cpp + test-capture.cpp +) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-capture.cpp b/tests/test-capture.cpp new file mode 100644 index 00000000..28c2b2df --- /dev/null +++ b/tests/test-capture.cpp @@ -0,0 +1,34 @@ +#include + +#include + +using log_surgeon::finite_automata::Capture; + +TEST_CASE("Capture operations", "[Capture]") { + SECTION("Basic name retrieval works correctly") { + Capture const capture{"uID"}; + REQUIRE("uID" == capture.get_name()); + } + + SECTION("Empty capture name is handled correctly") { + Capture const empty_capture{""}; + REQUIRE(empty_capture.get_name().empty()); + } + + SECTION("Special characters in capture names are preserved") { + Capture const special_capture{"user.id-123_@"}; + REQUIRE("user.id-123_@" == special_capture.get_name()); + } + + SECTION("Copy constructor works correctly") { + Capture assign_capture{"target"}; + assign_capture = Capture{"new_source"}; + REQUIRE("new_source" == assign_capture.get_name()); + } + + SECTION("Move constructor works correctly") { + Capture original_capture{"source"}; + Capture moved_capture{std::move(original_capture)}; + REQUIRE("source" == moved_capture.get_name()); + } +} diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 48b2185c..d93a8ea0 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -6,12 +6,18 @@ #include +#include #include #include #include #include +using log_surgeon::lexers::ByteLexer; +using log_surgeon::Schema; +using log_surgeon::SchemaAST; +using log_surgeon::SymbolId; using 
std::codecvt_utf8; +using std::make_unique; using std::string; using std::string_view; using std::u32string; @@ -49,9 +55,25 @@ auto test_regex_ast(string_view var_schema, u32string const& expected_serialized */ [[nodiscard]] auto u32string_to_string(u32string const& u32_str) -> string; +/** + * Initializes the lexer with the constant delimiters and the given schema. + * @param schema Contains the variables to add to the lexer. + * @param lexer Returns the initialized lexer. + */ +auto initialize_lexer(Schema schema, ByteLexer& lexer) -> void; + +/** + * Scans the given input to ensure the correct behavior. + * @param lexer The lexer to scan the input with. + * @param input The input to test. + * @param symbol The expected symbol to match. + */ +auto test_scanning_input(ByteLexer const& lexer, std::string_view input, std::string_view symbol) + -> void; + auto test_regex_ast(string_view const var_schema, u32string const& expected_serialized_ast) -> void { - log_surgeon::Schema schema; + Schema schema; schema.add_variable(var_schema, -1); auto const schema_ast = schema.release_schema_ast_ptr(); @@ -67,11 +89,73 @@ auto u32string_to_string(u32string const& u32_str) -> string { wstring_convert, char32_t> converter; return converter.to_bytes(u32_str.data(), u32_str.data() + u32_str.size()); } + +auto initialize_lexer(std::unique_ptr schema_ast, ByteLexer& lexer) -> void { + vector const cDelimiters{' ', '\n'}; + lexer.add_delimiters(cDelimiters); + + vector delimiters; + for (uint32_t i{0}; i < log_surgeon::cSizeOfByte; i++) { + if (lexer.is_delimiter(i)) { + delimiters.push_back(i); + } + } + + lexer.m_symbol_id[log_surgeon::cTokenEnd] = static_cast(SymbolId::TokenEnd); + lexer.m_symbol_id[log_surgeon::cTokenUncaughtString] + = static_cast(SymbolId::TokenUncaughtString); + lexer.m_id_symbol[static_cast(SymbolId::TokenEnd)] = log_surgeon::cTokenEnd; + lexer.m_id_symbol[static_cast(SymbolId::TokenUncaughtString)] + = log_surgeon::cTokenUncaughtString; + + for (auto 
const& m_schema_var : schema_ast->m_schema_vars) { + // For log-specific lexing: modify variable regex to contain a delimiter at the start. + auto delimiter_group{make_unique(RegexASTGroupByte(delimiters))}; + auto* rule{dynamic_cast(m_schema_var.get())}; + rule->m_regex_ptr = make_unique( + std::move(delimiter_group), + std::move(rule->m_regex_ptr) + ); + if (!lexer.m_symbol_id.contains(rule->m_name)) { + lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size(); + lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name; + } + lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr)); + } + lexer.generate(); +} + +auto test_scanning_input(ByteLexer& lexer, std::string_view input, std::string_view symbol) + -> void { + lexer.reset(); + + log_surgeon::ParserInputBuffer input_buffer; + string token_string{input}; + input_buffer.set_storage(token_string.data(), token_string.size(), 0, true); + lexer.prepend_start_of_file_char(input_buffer); + + log_surgeon::Token token; + auto error_code{lexer.scan(input_buffer, token)}; + REQUIRE(log_surgeon::ErrorCode::Success == error_code); + REQUIRE(nullptr != token.m_type_ids_ptr); + REQUIRE(1 == token.m_type_ids_ptr->size()); + REQUIRE(symbol == lexer.m_id_symbol[token.m_type_ids_ptr->at(0)]); + REQUIRE(input == token.to_string_view()); + + error_code = lexer.scan(input_buffer, token); + REQUIRE(log_surgeon::ErrorCode::Success == error_code); + REQUIRE(nullptr != token.m_type_ids_ptr); + REQUIRE(1 == token.m_type_ids_ptr->size()); + REQUIRE(log_surgeon::cTokenEnd == lexer.m_id_symbol[token.m_type_ids_ptr->at(0)]); + REQUIRE(token.to_string_view().empty()); + + // TODO: add check for register values when simulation is implemented. 
+} } // namespace TEST_CASE("Test the Schema class", "[Schema]") { SECTION("Add a number variable to schema") { - log_surgeon::Schema schema; + Schema schema; string const var_name = "myNumber"; string const var_schema = var_name + string(":") + string("123"); schema.add_variable(string_view(var_schema), -1); @@ -89,7 +173,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { } SECTION("Add a capture variable to schema") { - log_surgeon::Schema schema; + Schema schema; std::string const var_name = "capture"; string const var_schema = var_name + string(":") + string("u(?[0-9]+)"); schema.add_variable(var_schema, -1); @@ -208,3 +292,59 @@ TEST_CASE("Test the Schema class", "[Schema]") { ); } } + +TEST_CASE("Test basic Lexer", "[Lexer]") { + constexpr string_view cVarName{"myVar"}; + constexpr string_view cVarSchema{"myVar:123"}; + constexpr string_view cTokenString1{"123"}; + constexpr string_view cTokenString2{"234"}; + + Schema schema; + schema.add_variable(cVarSchema, -1); + + ByteLexer lexer; + initialize_lexer(std::move(schema.release_schema_ast_ptr()), lexer); + + test_scanning_input(lexer, cTokenString1, cVarName); + test_scanning_input(lexer, cTokenString2, log_surgeon::cTokenUncaughtString); +} + +TEST_CASE("Test Lexer with capture groups", "[Lexer]") { + vector const cDelimiters{' ', '\n'}; + constexpr string_view cVarName{"myVar"}; + constexpr string_view cCaptureName{"uid"}; + constexpr string_view cVarSchema{"myVar:userID=(?123)"}; + constexpr string_view cTokenString1{"userID=123"}; + constexpr string_view cTokenString2{"userID=234"}; + constexpr string_view cTokenString3{"123"}; + + Schema schema; + schema.add_variable(cVarSchema, -1); + + ByteLexer lexer; + initialize_lexer(std::move(schema.release_schema_ast_ptr()), lexer); + + string varName{cVarName}; + auto const var_id{lexer.m_symbol_id.find(varName)}; + REQUIRE(lexer.m_symbol_id.end() != var_id); + + string captureName{cCaptureName}; + auto const capture_id{lexer.m_symbol_id.find(captureName)}; 
+ REQUIRE(lexer.m_symbol_id.end() != capture_id); + + auto capture_ids{lexer.get_capture_ids_for_var_id(var_id->second)}; + REQUIRE(capture_ids.has_value()); + REQUIRE(1 == capture_ids.value().size()); + REQUIRE(capture_id->second == capture_ids.value()[0]); + + auto tag_ids{lexer.get_tag_ids_for_capture_id(capture_ids.value()[0])}; + REQUIRE(tag_ids.has_value()); + REQUIRE(std::make_pair(0u, 1u) == tag_ids.value()); + + // TODO: add check for get_register_for_tag_id and get_registers_for_capture when + // determinization is implemented. + + test_scanning_input(lexer, cTokenString1, cVarName); + test_scanning_input(lexer, cTokenString2, log_surgeon::cTokenUncaughtString); + test_scanning_input(lexer, cTokenString3, log_surgeon::cTokenUncaughtString); +} diff --git a/tests/test-nfa.cpp b/tests/test-nfa.cpp index 719a168e..2c4e3477 100644 --- a/tests/test-nfa.cpp +++ b/tests/test-nfa.cpp @@ -44,9 +44,10 @@ TEST_CASE("Test NFA", "[NFA]") { auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); vector rules; rules.emplace_back(0, std::move(capture_rule_ast.m_regex_ptr)); - ByteNfa const nfa{std::move(rules)}; + ByteNfa const nfa{rules}; // Compare against expected output + // capture order(tags in brackets): letter1(0,1), letter2(2,3), letter(4,5), containerID(6,7) string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," @@ -54,18 +55,17 @@ TEST_CASE("Test NFA", "[NFA]") { "negative_tagged_transition={}\n"; expected_serialized_nfa += "1:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={3[letter]}," + "positive_tagged_start_transitions={3[4]}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa - += "2:byte_transitions={}," - "epsilon_transitions={}," - "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={}," - 
"negative_tagged_transition={4[letter1,letter2,letter,containerID]}\n"; + expected_serialized_nfa += "2:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={4[0,1,2,3,4,5,6,7]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={5[letter1],6[letter2]}," + "positive_tagged_start_transitions={5[0],6[2]}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "4:accepting_tag=0,byte_transitions={}," @@ -86,27 +86,27 @@ TEST_CASE("Test NFA", "[NFA]") { expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={9[letter1]}," + "positive_tagged_end_transitions={9[1]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={10[letter2]}," + "positive_tagged_end_transitions={10[3]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "9:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," - "negative_tagged_transition={11[letter2]}\n"; + "negative_tagged_transition={11[2,3]}\n"; expected_serialized_nfa += "10:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," - "negative_tagged_transition={11[letter1]}\n"; + "negative_tagged_transition={11[0,1]}\n"; expected_serialized_nfa += "11:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={12[letter]}," + "positive_tagged_end_transitions={12[5]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "12:byte_transitions={B-->13}," 
"epsilon_transitions={}," @@ -115,7 +115,7 @@ TEST_CASE("Test NFA", "[NFA]") { "negative_tagged_transition={}\n"; expected_serialized_nfa += "13:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={14[containerID]}," + "positive_tagged_start_transitions={14[6]}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "14:byte_transitions={0-->15,1-->15,2-->15,3-->15,4-->15,5-->15,6-->" @@ -128,7 +128,7 @@ TEST_CASE("Test NFA", "[NFA]") { "15,7-->15,8-->15,9-->15}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={16[containerID]}," + "positive_tagged_end_transitions={16[7]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "16:byte_transitions={C-->4}," "epsilon_transitions={}," diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp deleted file mode 100644 index 41f8a2ef..00000000 --- a/tests/test-tag.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include - -#include - -using log_surgeon::finite_automata::Tag; - -TEST_CASE("Tag operations", "[Tag]") { - SECTION("Basic name retrieval works correctly") { - Tag const tag{"uID"}; - REQUIRE("uID" == tag.get_name()); - } - - SECTION("Empty tag name is handled correctly") { - Tag const empty_tag{""}; - REQUIRE(empty_tag.get_name().empty()); - } - - SECTION("Special characters in tag names are preserved") { - Tag const special_tag{"user.id-123_@"}; - REQUIRE("user.id-123_@" == special_tag.get_name()); - } - - SECTION("Copy constructor works correctly") { - Tag assign_tag{"target"}; - assign_tag = Tag{"new_source"}; - REQUIRE("new_source" == assign_tag.get_name()); - } - - SECTION("Move constructor works correctly") { - Tag original_tag{"source"}; - Tag moved_tag{std::move(original_tag)}; - REQUIRE("source" == moved_tag.get_name()); - } -}