diff --git a/CMakeLists.txt b/CMakeLists.txt index 2627928e..ceb932dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,9 +68,9 @@ set(SOURCE_FILES src/log_surgeon/Constants.hpp src/log_surgeon/FileReader.cpp src/log_surgeon/FileReader.hpp - src/log_surgeon/LALR1Parser.cpp - src/log_surgeon/LALR1Parser.hpp - src/log_surgeon/LALR1Parser.tpp + src/log_surgeon/Lalr1Parser.cpp + src/log_surgeon/Lalr1Parser.hpp + src/log_surgeon/Lalr1Parser.tpp src/log_surgeon/Lexer.hpp src/log_surgeon/Lexer.tpp src/log_surgeon/LexicalRule.hpp diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 4930d702..4c3bff89 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -5,10 +5,10 @@ #include #include +using log_surgeon::finite_automata::ByteDfaState; +using log_surgeon::finite_automata::ByteNfaState; using log_surgeon::finite_automata::Dfa; -using log_surgeon::finite_automata::DfaByteState; using log_surgeon::finite_automata::Nfa; -using log_surgeon::finite_automata::NfaByteState; using log_surgeon::lexers::ByteLexer; using log_surgeon::LexicalRule; using log_surgeon::ParserAST; @@ -17,11 +17,11 @@ using std::string; using std::unique_ptr; using std::vector; -using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteLexicalRule = log_surgeon::LexicalRule; auto get_intersect_for_query( std::map& m_id_symbol, - std::unique_ptr>& dfa1, + std::unique_ptr>& dfa1, std::string const& search_string ) -> void { std::string processed_search_string; @@ -40,9 +40,9 @@ auto get_intersect_for_query( auto* schema_var_ast = dynamic_cast(parser_ast.get()); rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr)); } - Nfa nfa(std::move(rules)); + Nfa nfa(std::move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); - auto schema_types = dfa1->get_intersect(dfa2); + auto schema_types = dfa1->get_intersect(dfa2.get()); std::cout << search_string << ":"; for (auto const& schema_type : schema_types) { std::cout << m_id_symbol[schema_type] << ","; @@ -78,7 +78,7 @@ auto main() -> int { rules.emplace_back(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); m_id_symbol[m_id_symbol.size()] = var_ast->m_name; } - Nfa nfa(std::move(rules)); + Nfa nfa(std::move(rules)); auto dfa = ByteLexer::nfa_to_dfa(nfa); get_intersect_for_query(m_id_symbol, dfa, "*1*"); get_intersect_for_query(m_id_symbol, dfa, "*a*"); diff --git a/src/log_surgeon/BufferParser.hpp b/src/log_surgeon/BufferParser.hpp index 75eb41a4..4a1a8e78 100644 --- a/src/log_surgeon/BufferParser.hpp +++ b/src/log_surgeon/BufferParser.hpp @@ -20,7 +20,7 @@ class BufferParser { /** * Constructs the parser using the given schema file. * @param schema_file_path - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure parsing the schema file or processing the schema * AST. */ @@ -29,7 +29,7 @@ class BufferParser { /** * Constructs the parser using the given schema AST. * @param schema_ast - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure processing the schema AST. */ explicit BufferParser(std::unique_ptr schema_ast); diff --git a/src/log_surgeon/LALR1Parser.cpp b/src/log_surgeon/Lalr1Parser.cpp similarity index 93% rename from src/log_surgeon/LALR1Parser.cpp rename to src/log_surgeon/Lalr1Parser.cpp index d1c4b7c0..a7f54a63 100644 --- a/src/log_surgeon/LALR1Parser.cpp +++ b/src/log_surgeon/Lalr1Parser.cpp @@ -1,4 +1,4 @@ -#include "LALR1Parser.hpp" +#include "Lalr1Parser.hpp" namespace log_surgeon { MatchedSymbol NonTerminal::m_all_children[cSizeOfAllChildren]; diff --git a/src/log_surgeon/LALR1Parser.hpp b/src/log_surgeon/Lalr1Parser.hpp similarity index 96% rename from src/log_surgeon/LALR1Parser.hpp rename to src/log_surgeon/Lalr1Parser.hpp index 6417c3c6..f3c6ee65 100644 --- a/src/log_surgeon/LALR1Parser.hpp +++ b/src/log_surgeon/Lalr1Parser.hpp @@ -200,10 +200,10 @@ struct ItemSet { std::vector m_actions; }; -template -class LALR1Parser : public Parser { +template +class Lalr1Parser : public Parser { public: - LALR1Parser(); + Lalr1Parser(); /** * Add a lexical rule to m_lexer @@ -212,7 +212,7 @@ class LALR1Parser : public Parser { */ auto add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) -> void override; /** @@ -222,7 +222,7 @@ class LALR1Parser : public Parser { */ auto add_token_group( std::string const& name, - std::unique_ptr> rule_group + std::unique_ptr> rule_group ) -> void; /** @@ -274,7 +274,7 @@ class LALR1Parser : public Parser { */ auto report_error() -> std::string; - /* Lexer m_lexer; */ + /* Lexer m_lexer; */ std::stack m_parse_stack_matches; std::stack m_parse_stack_states; ItemSet* m_root_item_set_ptr{nullptr}; @@ -405,6 +405,6 @@ class LALR1Parser : public Parser { }; } // namespace log_surgeon -#include "LALR1Parser.tpp" +#include "Lalr1Parser.tpp" #endif // LOG_SURGEON_LALR1_PARSER_HPP diff --git a/src/log_surgeon/LALR1Parser.tpp b/src/log_surgeon/Lalr1Parser.tpp similarity index 87% rename from src/log_surgeon/LALR1Parser.tpp rename to src/log_surgeon/Lalr1Parser.tpp index 5542a05e..dd00d17d 100644 --- a/src/log_surgeon/LALR1Parser.tpp +++ b/src/log_surgeon/Lalr1Parser.tpp @@ -53,8 +53,8 @@ namespace { } } // namespace -template -LALR1Parser::LALR1Parser() { +template +Lalr1Parser::Lalr1Parser() { m_terminals.insert((uint32_t)SymbolId::TokenEnd); m_terminals.insert((uint32_t)SymbolId::TokenUncaughtString); m_terminals.insert((uint32_t)SymbolId::TokenInt); @@ -65,43 +65,43 @@ LALR1Parser::LALR1Parser() { m_terminals.insert((uint32_t)SymbolId::TokenNewline); } -template -void LALR1Parser::add_rule( +template +void Lalr1Parser::add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) { - Parser::add_rule(name, std::move(rule)); + Parser::add_rule(name, std::move(rule)); m_terminals.insert(this->m_lexer.m_symbol_id[name]); } -template -void LALR1Parser::add_token_group( +template +void Lalr1Parser::add_token_group( std::string const& name, - std::unique_ptr> rule_group + std::unique_ptr> rule_group ) { add_rule(name, std::move(rule_group)); } -template -void LALR1Parser::add_token_chain( +template +void Lalr1Parser::add_token_chain( std::string const& name, std::string const& chain ) { assert(chain.size() > 1); - std::unique_ptr> first_char_rule - = std::make_unique>(chain[0]); - std::unique_ptr> second_char_rule - = std::make_unique>(chain[1]); - std::unique_ptr> rule_chain - = std::make_unique>( + std::unique_ptr> first_char_rule + = std::make_unique>(chain[0]); + std::unique_ptr> second_char_rule + = std::make_unique>(chain[1]); + std::unique_ptr> rule_chain + = std::make_unique>( std::move(first_char_rule), std::move(second_char_rule) ); for (uint32_t i = 2; i < chain.size(); i++) { char next_char = chain[i]; - std::unique_ptr> next_char_rule - = std::make_unique>(next_char); - rule_chain = std::make_unique>( + std::unique_ptr> next_char_rule + = std::make_unique>(next_char); + rule_chain = std::make_unique>( std::move(rule_chain), std::move(next_char_rule) ); @@ -109,8 +109,8 @@ void LALR1Parser::add_token_chain( add_rule(name, std::move(rule_chain)); } -template -auto LALR1Parser::add_production( +template +auto Lalr1Parser::add_production( std::string const& head, std::vector const& body, SemanticRule semantic_rule @@ -150,8 +150,8 @@ auto LALR1Parser::add_production( return n; } -template -void LALR1Parser::generate() { +template +void Lalr1Parser::generate() { this->m_lexer.generate(); assert(!m_productions.empty()); generate_lr0_kernels(); @@ -160,8 +160,8 @@ void LALR1Parser::generate() { generate_lalr1_parsing_table(); } -template -void LALR1Parser::generate_lr0_kernels() { +template +void Lalr1Parser::generate_lr0_kernels() { Production* root_production_ptr = m_productions[m_root_production_id].get(); Item root_item(root_production_ptr, 0, cNullSymbol); std::unique_ptr item_set0 = std::make_unique(); @@ -190,8 +190,8 @@ void LALR1Parser::generate_lr0_kernels() { } } -template -auto LALR1Parser::lr_closure_helper( +template +auto Lalr1Parser::lr_closure_helper( ItemSet* item_set_ptr, Item const* item, uint32_t* next_symbol @@ -210,8 +210,8 @@ auto LALR1Parser::lr_closure_helper( return false; } -template -void LALR1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { +template +void Lalr1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { std::deque q( item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end() @@ -233,8 +233,8 @@ void LALR1Parser::generate_lr0_closure(ItemSet* item } } -template -auto LALR1Parser::go_to( +template +auto Lalr1Parser::go_to( ItemSet* from_item_set, uint32_t const& next_symbol ) -> ItemSet* { @@ -266,8 +266,8 @@ auto LALR1Parser::go_to( return nullptr; } -template -void LALR1Parser::generate_first_sets() { +template +void Lalr1Parser::generate_first_sets() { for (uint32_t const& s : m_terminals) { m_firsts.insert(std::pair>(s, {s})); } @@ -298,8 +298,8 @@ void LALR1Parser::generate_first_sets() { } } -template -void LALR1Parser::generate_lr1_item_sets() { +template +void Lalr1Parser::generate_lr1_item_sets() { for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) { for (Item const& l0_item : kv.second->m_kernel) { @@ -382,8 +382,8 @@ void LALR1Parser::generate_lr1_item_sets() { } } -template -void LALR1Parser::generate_lr1_closure(ItemSet* item_set_ptr) { +template +void Lalr1Parser::generate_lr1_closure(ItemSet* item_set_ptr) { std::deque queue(item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end()); while (!queue.empty()) { Item item = queue.back(); @@ -418,20 +418,20 @@ void LALR1Parser::generate_lr1_closure(ItemSet* item } } -template -void LALR1Parser::generate_lalr1_parsing_table() { +template +void Lalr1Parser::generate_lalr1_parsing_table() { generate_lalr1_goto(); generate_lalr1_action(); } -template -void LALR1Parser::generate_lalr1_goto() { +template +void Lalr1Parser::generate_lalr1_goto() { // done already at end of generate_lr1_item_sets()? } // Dragon book page 253 -template -void LALR1Parser::generate_lalr1_action() { +template +void Lalr1Parser::generate_lalr1_action() { for (std::map, std::unique_ptr>::value_type const& kv : m_lr1_item_sets) { ItemSet* item_set_ptr = kv.second.get(); @@ -518,8 +518,8 @@ void LALR1Parser::generate_lalr1_action() { } } -template -auto LALR1Parser::get_input_after_last_newline( +template +auto Lalr1Parser::get_input_after_last_newline( std::stack& parse_stack_matches ) -> std::string { std::string error_message_reversed; @@ -557,8 +557,8 @@ auto LALR1Parser::get_input_after_last_newline( return error_message_reversed; } -template -auto LALR1Parser::get_input_until_next_newline(Token* error_token +template +auto Lalr1Parser::get_input_until_next_newline(Token* error_token ) -> std::string { std::string rest_of_line; bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd); @@ -577,8 +577,8 @@ auto LALR1Parser::get_input_until_next_newline(Token return rest_of_line; } -template -auto LALR1Parser::report_error() -> std::string { +template +auto Lalr1Parser::report_error() -> std::string { assert(m_next_token == std::nullopt); assert(!m_parse_stack_matches.empty()); MatchedSymbol top_symbol = std::move(m_parse_stack_matches.top()); @@ -604,7 +604,7 @@ auto LALR1Parser::report_error() -> std::string { if (action.index() != 0) { error_type += "'"; if (auto* regex_ast_literal - = dynamic_cast*>( + = dynamic_cast*>( this->m_lexer.get_rule(i) )) { @@ -628,8 +628,8 @@ auto LALR1Parser::report_error() -> std::string { return error_string; } -template -auto LALR1Parser::parse(Reader& reader) -> NonTerminal { +template +auto Lalr1Parser::parse(Reader& reader) -> NonTerminal { reset(); m_parse_stack_states.push(m_root_item_set_ptr); bool accept = false; @@ -650,8 +650,8 @@ auto LALR1Parser::parse(Reader& reader) -> NonTermin return std::move(std::get(m)); } -template -void LALR1Parser::reset() { +template +void Lalr1Parser::reset() { m_next_token = std::nullopt; while (!m_parse_stack_states.empty()) { m_parse_stack_states.pop(); @@ -663,8 +663,8 @@ void LALR1Parser::reset() { this->m_lexer.reset(); } -template -auto LALR1Parser::get_next_symbol() -> Token { +template +auto Lalr1Parser::get_next_symbol() -> Token { if (m_next_token == std::nullopt) { Token token; if (ErrorCode error = this->m_lexer.scan(m_input_buffer, token); @@ -679,8 +679,8 @@ auto LALR1Parser::get_next_symbol() -> Token { return s; } -template -auto LALR1Parser::parse_advance(Token& next_token, bool* accept) +template +auto Lalr1Parser::parse_advance(Token& next_token, bool* accept) -> bool { for (auto const type : *next_token.m_type_ids_ptr) { if (parse_symbol(type, next_token, accept)) { @@ -693,8 +693,8 @@ auto LALR1Parser::parse_advance(Token& next_token, b return true; } -template -auto LALR1Parser::parse_symbol( +template +auto Lalr1Parser::parse_symbol( uint32_t const& type_id, Token& next_token, bool* accept diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 0cfee645..a392502a 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -19,7 +20,7 @@ #include namespace log_surgeon { -template +template class Lexer { public: static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolId::TokenEnd}; @@ -28,11 +29,11 @@ class Lexer { /** * Generate a DFA from an NFA - * @param finite_automata::Nfa nfa - * @return std::unique_ptr> + * @param finite_automata::Nfa nfa + * @return std::unique_ptr> */ - static auto nfa_to_dfa(finite_automata::Nfa& nfa - ) -> std::unique_ptr>; + static auto nfa_to_dfa(finite_automata::Nfa& nfa + ) -> std::unique_ptr>; /** * Add a delimiters line from the schema to the lexer @@ -45,15 +46,17 @@ class Lexer { * @param id * @param regex */ - auto add_rule(uint32_t const& id, std::unique_ptr> rule) - -> void; + auto add_rule( + uint32_t const& id, + std::unique_ptr> rule + ) -> void; /** * Return regex pattern for a rule name * @param variable_id * @return finite_automata::RegexAST* */ - auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; + auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; /** * Generate DFA for lexer @@ -123,7 +126,7 @@ class Lexer { } [[nodiscard]] auto get_dfa( - ) const -> std::unique_ptr> const& { + ) const -> std::unique_ptr> const& { return m_dfa; } @@ -135,7 +138,7 @@ class Lexer { * Return epsilon_closure over m_epsilon_transitions * @return */ - static auto epsilon_closure(NfaStateType const* state_ptr) -> std::set; + static auto epsilon_closure(TypedNfaState const* state_ptr) -> std::set; /** * Get next character from the input buffer @@ -153,17 +156,17 @@ class Lexer { std::set m_type_ids_set; std::array m_is_delimiter{false}; std::array m_is_first_char{false}; - std::vector> m_rules; + std::vector> m_rules; uint32_t m_line{0}; bool m_has_delimiters{false}; - std::unique_ptr> m_dfa; + std::unique_ptr> m_dfa; bool m_asked_for_more_data{false}; - DfaStateType const* m_prev_state{nullptr}; + TypedDfaState const* m_prev_state{nullptr}; }; namespace lexers { -using ByteLexer = Lexer; -using Utf8Lexer = Lexer; +using ByteLexer = Lexer; +using Utf8Lexer = Lexer; } // namespace lexers } // namespace log_surgeon diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index c5feb85f..bcd3f9ae 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -17,8 +17,8 @@ * 4 byte: 0x10000 - 0x1FFFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ namespace log_surgeon { -template -void Lexer::flip_states(uint32_t old_storage_size) { +template +void Lexer::flip_states(uint32_t old_storage_size) { if (m_match_pos >= old_storage_size / 2) { m_match_pos -= old_storage_size / 2; } else { @@ -38,10 +38,10 @@ void Lexer::flip_states(uint32_t old_storage_size) { } } -template -auto Lexer::scan(ParserInputBuffer& input_buffer, Token& token) +template +auto Lexer::scan(ParserInputBuffer& input_buffer, Token& token) -> ErrorCode { - DfaStateType const* state = m_dfa->get_root(); + TypedDfaState const* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; m_asked_for_more_data = false; @@ -80,7 +80,7 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } - DfaStateType* next = state->next(next_char); + TypedDfaState* next = state->next(next_char); if (next_char == '\n') { m_line++; if (m_has_delimiters && !m_match) { @@ -166,13 +166,13 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To } // TODO: this is duplicating almost all the code of scan() -template -auto Lexer::scan_with_wildcard( +template +auto Lexer::scan_with_wildcard( ParserInputBuffer& input_buffer, char wildcard, Token& token ) -> ErrorCode { - DfaStateType const* state = m_dfa->get_root(); + TypedDfaState const* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; m_asked_for_more_data = false; @@ -211,7 +211,7 @@ auto Lexer::scan_with_wildcard( m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } - DfaStateType const* next = state->next(next_char); + TypedDfaState const* next = state->next(next_char); if (next_char == '\n') { m_line++; if (m_has_delimiters && !m_match) { @@ -239,7 +239,7 @@ auto Lexer::scan_with_wildcard( // BFS (keep track of m_type_ids) if (wildcard == '?') { for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - DfaStateType* next_state = state->next(byte); + TypedDfaState* next_state = state->next(byte); if (next_state->is_accepting() == false) { token = Token{m_last_match_pos, @@ -252,11 +252,11 @@ auto Lexer::scan_with_wildcard( } } } else if (wildcard == '*') { - std::stack unvisited_states; - std::set visited_states; + std::stack unvisited_states; + std::set visited_states; unvisited_states.push(state); while (!unvisited_states.empty()) { - DfaStateType const* current_state = unvisited_states.top(); + TypedDfaState const* current_state = unvisited_states.top(); if (current_state == nullptr || current_state->is_accepting() == false) { token = Token{m_last_match_pos, @@ -273,7 +273,7 @@ auto Lexer::scan_with_wildcard( if (m_is_delimiter[byte]) { continue; } - DfaStateType const* next_state = current_state->next(byte); + TypedDfaState const* next_state = current_state->next(byte); if (visited_states.find(next_state) == visited_states.end()) { unvisited_states.push(next_state); } @@ -299,8 +299,8 @@ auto Lexer::scan_with_wildcard( } } -template -auto Lexer::increase_buffer_capacity(ParserInputBuffer& input_buffer +template +auto Lexer::increase_buffer_capacity(ParserInputBuffer& input_buffer ) -> void { uint32_t old_storage_size{0}; bool flipped_static_buffer{false}; @@ -316,8 +316,8 @@ auto Lexer::increase_buffer_capacity(ParserInputBuff } } -template -void Lexer::reset() { +template +void Lexer::reset() { m_last_match_pos = 0; m_match = false; m_line = 0; @@ -330,8 +330,8 @@ void Lexer::reset() { m_prev_state = nullptr; } -template -void Lexer::prepend_start_of_file_char(ParserInputBuffer& input_buffer +template +void Lexer::prepend_start_of_file_char(ParserInputBuffer& input_buffer ) { m_prev_state = m_dfa->get_root()->next(utf8::cCharStartOfFile); m_asked_for_more_data = true; @@ -341,8 +341,8 @@ void Lexer::prepend_start_of_file_char(ParserInputBu m_type_ids = nullptr; } -template -void Lexer::add_delimiters(std::vector const& delimiters) { +template +void Lexer::add_delimiters(std::vector const& delimiters) { assert(!delimiters.empty()); m_has_delimiters = true; for (bool& i : m_is_delimiter) { @@ -354,17 +354,17 @@ void Lexer::add_delimiters(std::vector con m_is_delimiter[utf8::cCharStartOfFile] = true; } -template -void Lexer::add_rule( +template +void Lexer::add_rule( uint32_t const& id, - std::unique_ptr> rule + std::unique_ptr> rule ) { m_rules.emplace_back(id, std::move(rule)); } -template -auto Lexer::get_rule(uint32_t const variable_id -) -> finite_automata::RegexAST* { +template +auto Lexer::get_rule(uint32_t const variable_id +) -> finite_automata::RegexAST* { for (auto const& rule : m_rules) { if (rule.get_variable_id() == variable_id) { return rule.get_regex(); @@ -373,12 +373,12 @@ auto Lexer::get_rule(uint32_t const variable_id return nullptr; } -template -void Lexer::generate() { - finite_automata::Nfa nfa{std::move(m_rules)}; +template +void Lexer::generate() { + finite_automata::Nfa nfa{std::move(m_rules)}; // TODO: DFA ignores tags. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); - DfaStateType const* state = m_dfa->get_root(); + TypedDfaState const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { if (state->next(i) != nullptr) { m_is_first_char[i] = true; @@ -388,11 +388,11 @@ void Lexer::generate() { } } -template -auto Lexer::epsilon_closure(NfaStateType const* state_ptr -) -> std::set { - std::set closure_set; - std::stack stack; +template +auto Lexer::epsilon_closure(TypedNfaState const* state_ptr +) -> std::set { + std::set closure_set; + std::stack stack; stack.push(state_ptr); while (!stack.empty()) { auto const* current_state = stack.top(); @@ -425,17 +425,17 @@ auto Lexer::epsilon_closure(NfaStateType const* stat return closure_set; } -template -auto Lexer::nfa_to_dfa(finite_automata::Nfa& nfa -) -> std::unique_ptr> { - typedef std::set StateSet; - std::unique_ptr> dfa - = std::make_unique>(); - std::map dfa_states; +template +auto Lexer::nfa_to_dfa(finite_automata::Nfa& nfa +) -> std::unique_ptr> { + typedef std::set StateSet; + std::unique_ptr> dfa + = std::make_unique>(); + std::map dfa_states; std::stack unmarked_sets; auto create_dfa_state - = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> DfaStateType* { - DfaStateType* state = dfa->new_state(set); + = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> TypedDfaState* { + TypedDfaState* state = dfa->new_state(set); dfa_states[set] = state; unmarked_sets.push(set); return state; @@ -445,29 +445,28 @@ auto Lexer::nfa_to_dfa(finite_automata::Nfa ascii_transitions_map; // map transitions_map; - for (NfaStateType const* s0 : set) { + for (TypedNfaState const* s0 : set) { for (uint32_t i = 0; i < cSizeOfByte; i++) { - for (NfaStateType* const s1 : s0->get_byte_transitions(i)) { + for (TypedNfaState* const s1 : s0->get_byte_transitions(i)) { StateSet closure = epsilon_closure(s1); ascii_transitions_map[i].insert(closure.begin(), closure.end()); } } // TODO: add this for the utf8 case /* - for (const typename NfaStateType::Tree::Data& data : s0->get_tree_transitions().all()) { - for (NfaStateType* const s1 : data.m_value) { - StateSet closure = epsilon_closure(s1); + for (const typename TypedNfaState::Tree::Data& data : s0->get_tree_transitions().all()) + { for (TypedNfaState* const s1 : data.m_value) { StateSet closure = epsilon_closure(s1); transitions_map[data.m_interval].insert(closure.begin(), closure.end()); } } */ } auto next_dfa_state - = [&dfa_states, &create_dfa_state](StateSet const& set) -> DfaStateType* { - DfaStateType* state{nullptr}; + = [&dfa_states, &create_dfa_state](StateSet const& set) -> TypedDfaState* { + TypedDfaState* state{nullptr}; auto it = dfa_states.find(set); if (it == dfa_states.end()) { state = create_dfa_state(set); @@ -477,15 +476,15 @@ auto Lexer::nfa_to_dfa(finite_automata::Nfa::value_type const& kv : ascii_transitions_map) { - DfaStateType* dest_state = next_dfa_state(kv.second); + TypedDfaState* dest_state = next_dfa_state(kv.second); dfa_state->add_byte_transition(kv.first, dest_state); } // TODO: add this for the utf8 case /* - for (const typename map::value_type& kv : + for (const typename map::value_type& kv : transitions_map) { - DfaStateType* dest_state = next_dfa_state(kv.second); + TypedDfaState* dest_state = next_dfa_state(kv.second); dfa_state->add_tree_transition(kv.first, dest_state); } */ diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp index f5f266b3..6ab7e861 100644 --- a/src/log_surgeon/LexicalRule.hpp +++ b/src/log_surgeon/LexicalRule.hpp @@ -6,13 +6,13 @@ #include namespace log_surgeon { -template +template class LexicalRule { public: // Constructor LexicalRule( uint32_t const variable_id, - std::unique_ptr> regex + std::unique_ptr> regex ) : m_variable_id(variable_id), m_regex(std::move(regex)) {} @@ -21,22 +21,22 @@ class LexicalRule { * Adds AST representing the lexical rule to the NFA * @param nfa */ - auto add_to_nfa(finite_automata::Nfa* nfa) const -> void; + auto add_to_nfa(finite_automata::Nfa* nfa) const -> void; [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } - [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { + [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { // TODO: make the returned pointer constant return m_regex.get(); } private: uint32_t m_variable_id; - std::unique_ptr> m_regex; + std::unique_ptr> m_regex; }; -template -void LexicalRule::add_to_nfa(finite_automata::Nfa* nfa) const { +template +void LexicalRule::add_to_nfa(finite_automata::Nfa* nfa) const { auto* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index be680c5d..89d3abf6 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -17,8 +17,8 @@ using std::unique_ptr; using std::vector; namespace log_surgeon { -using finite_automata::DfaByteState; -using finite_automata::NfaByteState; +using finite_automata::ByteDfaState; +using finite_automata::ByteNfaState; using finite_automata::RegexAST; using finite_automata::RegexASTCat; using finite_automata::RegexASTGroup; @@ -62,24 +62,24 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* rule = dynamic_cast(parser_ast.get()); if (rule->m_name == "timestamp") { - unique_ptr> first_timestamp_regex_ast(rule->m_regex_ptr->clone() + unique_ptr> first_timestamp_regex_ast(rule->m_regex_ptr->clone() ); - unique_ptr> r1 - = make_unique>(utf8::cCharStartOfFile); + unique_ptr> r1 + = make_unique>(utf8::cCharStartOfFile); add_rule( "firstTimestamp", - make_unique>( + make_unique>( std::move(r1), std::move(first_timestamp_regex_ast) ) ); - unique_ptr> newline_timestamp_regex_ast(rule->m_regex_ptr->clone( + unique_ptr> newline_timestamp_regex_ast(rule->m_regex_ptr->clone( )); - unique_ptr> r2 - = make_unique>('\n'); + unique_ptr> r2 + = make_unique>('\n'); add_rule( "newLineTimestamp", - make_unique>( + make_unique>( std::move(r2), std::move(newline_timestamp_regex_ast) ) @@ -140,9 +140,9 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { } // For log-specific lexing: modify variable regex to contain a delimiter at the start. - unique_ptr> delimiter_group - = make_unique>(RegexASTGroup(delimiters)); - rule->m_regex_ptr = make_unique>( + unique_ptr> delimiter_group + = make_unique>(RegexASTGroup(delimiters)); + rule->m_regex_ptr = make_unique>( std::move(delimiter_group), std::move(rule->m_regex_ptr) ); @@ -193,7 +193,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { // make a message with just the '\n' character next_token.m_end_pos = next_token.m_start_pos + 1; next_token.m_type_ids_ptr - = &Lexer::cTokenUncaughtStringTypes; + = &Lexer::cTokenUncaughtStringTypes; output_buffer->set_token(1, next_token); output_buffer->set_pos(2); m_input_buffer.set_consumed_pos(next_token.m_start_pos); @@ -259,7 +259,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { Token curr_token = output_buffer->get_curr_token(); curr_token.m_end_pos = curr_token.m_start_pos + 1; curr_token.m_type_ids_ptr - = &Lexer::cTokenUncaughtStringTypes; + = &Lexer::cTokenUncaughtStringTypes; output_buffer->set_curr_token(curr_token); if (0 == m_start_of_log_message.m_start_pos) { m_input_buffer.set_consumed_pos(m_input_buffer.storage().size() - 1); diff --git a/src/log_surgeon/LogParser.hpp b/src/log_surgeon/LogParser.hpp index 14d77f88..7605fe5f 100644 --- a/src/log_surgeon/LogParser.hpp +++ b/src/log_surgeon/LogParser.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include @@ -15,7 +15,7 @@ namespace log_surgeon { // TODO: Compare c-array vs. vectors (its underlying array) for buffers -class LogParser : public Parser { +class LogParser : public Parser { public: enum class ParsingAction { None, @@ -26,7 +26,7 @@ class LogParser : public Parser schema_ast); diff --git a/src/log_surgeon/Parser.hpp b/src/log_surgeon/Parser.hpp index 0caf4916..37d5734e 100644 --- a/src/log_surgeon/Parser.hpp +++ b/src/log_surgeon/Parser.hpp @@ -5,19 +5,19 @@ namespace log_surgeon { -template +template class Parser { public: Parser(); virtual auto add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) -> void; auto add_token(std::string const& name, char rule_char) -> void; - Lexer m_lexer; + Lexer m_lexer; }; } // namespace log_surgeon diff --git a/src/log_surgeon/Parser.tpp b/src/log_surgeon/Parser.tpp index 8d60ce77..4747072e 100644 --- a/src/log_surgeon/Parser.tpp +++ b/src/log_surgeon/Parser.tpp @@ -7,8 +7,8 @@ namespace log_surgeon { -template -Parser::Parser() { +template +Parser::Parser() { // TODO move clp-reserved symbols out of the parser m_lexer.m_symbol_id[cTokenEnd] = (uint32_t)SymbolId::TokenEnd; m_lexer.m_symbol_id[cTokenUncaughtString] = (uint32_t)SymbolId::TokenUncaughtString; @@ -29,10 +29,10 @@ Parser::Parser() { m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenNewline] = cTokenNewline; } -template -void Parser::add_rule( +template +void Parser::add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) { if (m_lexer.m_symbol_id.find(name) == m_lexer.m_symbol_id.end()) { m_lexer.m_symbol_id[name] = m_lexer.m_symbol_id.size(); @@ -41,9 +41,9 @@ void Parser::add_rule( m_lexer.add_rule(m_lexer.m_symbol_id[name], std::move(rule)); } -template -void Parser::add_token(std::string const& name, char rule_char) { - add_rule(name, std::make_unique>(rule_char)); +template +void Parser::add_token(std::string const& name, char rule_char) { + add_rule(name, std::make_unique>(rule_char)); } } // namespace log_surgeon diff --git a/src/log_surgeon/ReaderParser.hpp b/src/log_surgeon/ReaderParser.hpp index 9465efbd..805cd7b4 100644 --- a/src/log_surgeon/ReaderParser.hpp +++ b/src/log_surgeon/ReaderParser.hpp @@ -19,7 +19,7 @@ class ReaderParser { /** * Constructs the parser using the the given schema file. * @param schema_file_path - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure parsing the schema file or processing the schema * AST. */ @@ -28,7 +28,7 @@ class ReaderParser { /** * Constructs the parser using the given schema AST. * @param schema_ast - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure processing the schema AST. */ explicit ReaderParser(std::unique_ptr schema_ast); diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index c7c5e6a4..3c7bd2c1 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -11,30 +11,30 @@ #include #include #include -#include +#include #include #include using ParserValueRegex = log_surgeon::ParserValue>>; + log_surgeon::finite_automata::RegexAST>>; using RegexASTByte - = log_surgeon::finite_automata::RegexAST; + = log_surgeon::finite_automata::RegexAST; using RegexASTGroupByte - = log_surgeon::finite_automata::RegexASTGroup; + = log_surgeon::finite_automata::RegexASTGroup; using RegexASTIntegerByte - = log_surgeon::finite_automata::RegexASTInteger; + = log_surgeon::finite_automata::RegexASTInteger; using RegexASTLiteralByte - = log_surgeon::finite_automata::RegexASTLiteral; + = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< - log_surgeon::finite_automata::NfaByteState>; + log_surgeon::finite_automata::ByteNfaState>; using RegexASTOrByte - = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTOr; using RegexASTCatByte - = log_surgeon::finite_automata::RegexASTCat; + = log_surgeon::finite_automata::RegexASTCat; using RegexASTCaptureByte - = log_surgeon::finite_automata::RegexASTCapture; + = log_surgeon::finite_automata::RegexASTCapture; using RegexASTEmptyByte - = log_surgeon::finite_automata::RegexASTEmpty; + = log_surgeon::finite_automata::RegexASTEmpty; using std::make_unique; using std::string; diff --git a/src/log_surgeon/SchemaParser.hpp b/src/log_surgeon/SchemaParser.hpp index 748b94a0..36db6119 100644 --- a/src/log_surgeon/SchemaParser.hpp +++ b/src/log_surgeon/SchemaParser.hpp @@ -5,7 +5,7 @@ #include #include -#include +#include namespace log_surgeon { // ASTs used in SchemaParser AST @@ -46,7 +46,7 @@ class SchemaVarAST : public ParserAST { // Constructor SchemaVarAST( std::string name, - std::unique_ptr> regex_ptr, + std::unique_ptr> regex_ptr, uint32_t line_num ) : m_line_num(line_num), @@ -55,7 +55,7 @@ class SchemaVarAST : public ParserAST { uint32_t m_line_num; std::string m_name; - std::unique_ptr> m_regex_ptr; + std::unique_ptr> m_regex_ptr; }; class DelimiterStringAST : public ParserAST { @@ -69,7 +69,7 @@ class DelimiterStringAST : public ParserAST { }; class SchemaParser - : public LALR1Parser { + : public Lalr1Parser { public: /** * File wrapper around generate_schema_ast() diff --git a/src/log_surgeon/finite_automata/Dfa.hpp b/src/log_surgeon/finite_automata/Dfa.hpp index 125cd3e4..5d425dd2 100644 --- a/src/log_surgeon/finite_automata/Dfa.hpp +++ b/src/log_surgeon/finite_automata/Dfa.hpp @@ -9,40 +9,38 @@ #include namespace log_surgeon::finite_automata { -template +template class Dfa { public: /** - * Creates a new DFA state based on a set of NFA states and adds it to - * m_states - * @param nfa_state_set - * @return DfaStateType* + * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. + * @param nfa_state_set The set of NFA states represented by this DFA state. + * @return A pointer to the new DFA state. */ - template - auto new_state(std::set const& nfa_state_set) -> DfaStateType*; + template + auto new_state(std::set const& nfa_state_set) -> TypedDfaState*; - auto get_root() const -> DfaStateType const* { return m_states.at(0).get(); } + auto get_root() const -> TypedDfaState const* { return m_states.at(0).get(); } /** - * Compares this dfa with dfa_in to determine the set of schema types in - * this dfa that are reachable by any type in dfa_in. A type is considered - * reachable if there is at least one string for which: (1) this dfa returns - * a set of types containing the type, and (2) dfa_in returns any non-empty - * set of types. - * @param dfa_in - * @return The set of schema types reachable by dfa_in + * Compares this dfa with `dfa_in` to determine the set of schema types in this dfa that are + * reachable by any type in `dfa_in`. A type is considered reachable if there is at least one + * string for which: (1) this dfa returns a set of types containing the type, and (2) `dfa_in` + * returns any non-empty set of types. + * @param dfa_in The dfa with which to take the intersect. + * @return The set of schema types reachable by `dfa_in`. */ - [[nodiscard]] auto get_intersect(std::unique_ptr const& dfa_in - ) const -> std::set; + [[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set; private: - std::vector> m_states; + std::vector> m_states; }; -template -template -auto Dfa::new_state(std::set const& nfa_state_set) -> DfaStateType* { - m_states.emplace_back(std::make_unique()); +template +template +auto Dfa::new_state(std::set const& nfa_state_set +) -> TypedDfaState* { + m_states.emplace_back(std::make_unique()); auto* dfa_state = m_states.back().get(); for (auto const* nfa_state : nfa_state_set) { if (nfa_state->is_accepting()) { @@ -52,12 +50,11 @@ auto Dfa::new_state(std::set const& nfa_state_set) return dfa_state; } -template -auto Dfa::get_intersect(std::unique_ptr const& dfa_in -) const -> std::set { +template +auto Dfa::get_intersect(Dfa const* dfa_in) const -> std::set { std::set schema_types; - std::set> unvisited_pairs; - std::set> visited_pairs; + std::set> unvisited_pairs; + std::set> visited_pairs; unvisited_pairs.emplace(get_root(), dfa_in->get_root()); // TODO: Handle UTF-8 (multi-byte transitions) as well while (false == unvisited_pairs.empty()) { diff --git a/src/log_surgeon/finite_automata/DfaState.hpp b/src/log_surgeon/finite_automata/DfaState.hpp index 7345843c..f25b25ac 100644 --- a/src/log_surgeon/finite_automata/DfaState.hpp +++ b/src/log_surgeon/finite_automata/DfaState.hpp @@ -5,8 +5,10 @@ #include #include #include +#include #include +#include #include #include @@ -14,14 +16,16 @@ namespace log_surgeon::finite_automata { template class DfaState; -using DfaByteState = DfaState; -using DfaUtf8State = DfaState; +using ByteDfaState = DfaState; +using Utf8DfaState = DfaState; -template +template class DfaState { public: using Tree = UnicodeIntervalTree; + DfaState() { std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); } + auto add_matching_variable_id(uint32_t const variable_id) -> void { m_matching_variable_ids.push_back(variable_id); } @@ -30,30 +34,31 @@ class DfaState { return m_matching_variable_ids; } - [[nodiscard]] auto is_accepting() const -> bool { return !m_matching_variable_ids.empty(); } + [[nodiscard]] auto is_accepting() const -> bool { + return false == m_matching_variable_ids.empty(); + } auto add_byte_transition(uint8_t const& byte, DfaState* dest_state) -> void { m_bytes_transition[byte] = dest_state; } /** - * Returns the next state the DFA transitions to on input character (byte or utf8). - * @param character - * @return DfaState* + * @param character The character (byte or utf8) to transition on. + * @return A pointer to the DFA state reached after transitioning on `character`. */ [[nodiscard]] auto next(uint32_t character) const -> DfaState*; private: std::vector m_matching_variable_ids; DfaState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == DfaStateType::Byte` case, - // so we use an empty class (`std::tuple<>`) in that case. - std::conditional_t> m_tree_transitions; + // NOTE: We don't need m_tree_transitions for the `state_type == DfaStateType::Byte` case, so we + // use an empty class (`std::tuple<>`) in that case. + std::conditional_t> m_tree_transitions; }; -template -auto DfaState::next(uint32_t character) const -> DfaState* { - if constexpr (DfaStateType::Byte == stateType) { +template +auto DfaState::next(uint32_t character) const -> DfaState* { + if constexpr (DfaStateType::Byte == state_type) { return m_bytes_transition[character]; } else { if (character < cSizeOfByte) { @@ -62,7 +67,7 @@ auto DfaState::next(uint32_t character) const -> DfaState* { std::unique_ptr> result = m_tree_transitions.find(Interval(character, character)); assert(result->size() <= 1); - if (!result->empty()) { + if (false == result->empty()) { return result->front().m_value; } return nullptr; diff --git a/src/log_surgeon/finite_automata/DfaStatePair.hpp b/src/log_surgeon/finite_automata/DfaStatePair.hpp index 358f396f..67ecb622 100644 --- a/src/log_surgeon/finite_automata/DfaStatePair.hpp +++ b/src/log_surgeon/finite_automata/DfaStatePair.hpp @@ -1,10 +1,11 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE_PAIR #define LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE_PAIR +#include #include #include -#include +#include namespace log_surgeon::finite_automata { /** @@ -18,10 +19,10 @@ namespace log_surgeon::finite_automata { * * NOTE: Only the first state in the pair contains the variable types matched by the pair. */ -template +template class DfaStatePair { public: - DfaStatePair(DfaState const* state1, DfaState const* state2) + DfaStatePair(TypedDfaState const* state1, TypedDfaState const* state2) : m_state1(state1), m_state2(state2) {}; @@ -58,12 +59,12 @@ class DfaStatePair { } private: - DfaState const* m_state1; - DfaState const* m_state2; + TypedDfaState const* m_state1; + TypedDfaState const* m_state2; }; -template -auto DfaStatePair::get_reachable_pairs( +template +auto DfaStatePair::get_reachable_pairs( std::set& visited_pairs, std::set& unvisited_pairs ) const -> void { diff --git a/src/log_surgeon/finite_automata/Nfa.hpp b/src/log_surgeon/finite_automata/Nfa.hpp index caf58ce4..8eaaaadd 100644 --- a/src/log_surgeon/finite_automata/Nfa.hpp +++ b/src/log_surgeon/finite_automata/Nfa.hpp @@ -17,18 +17,18 @@ #include namespace log_surgeon::finite_automata { -template +template class Nfa { public: - using StateVec = std::vector; + using StateVec = std::vector; - explicit Nfa(std::vector> rules); + explicit Nfa(std::vector> rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. - * @return NfaStateType* + * @return TypedNfaState* */ - [[nodiscard]] auto new_state() -> NfaStateType*; + [[nodiscard]] auto new_state() -> TypedNfaState*; /** * Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to @@ -39,20 +39,20 @@ class Nfa { */ [[nodiscard]] auto new_state_with_positive_tagged_end_transition( Tag const* tag, - NfaStateType const* dest_state - ) -> NfaStateType*; + TypedNfaState const* dest_state + ) -> TypedNfaState*; /** * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. * @param tags * @param dest_state - * @return NfaStateType* + * @return TypedNfaState* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( std::vector tags, - NfaStateType const* dest_state - ) -> NfaStateType*; + TypedNfaState const* dest_state + ) -> TypedNfaState*; /** * Creates the start and end states for a capture group. @@ -64,38 +64,38 @@ class Nfa { */ [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, - NfaStateType const* dest_state - ) -> std::pair; + TypedNfaState const* dest_state + ) -> std::pair; /** * @return A vector representing the traversal order of the NFA states using breadth-first * search (BFS). */ - [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; + [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; /** * @return A string representation of the NFA. */ [[nodiscard]] auto serialize() const -> std::string; - auto add_root_interval(Interval interval, NfaStateType* dest_state) -> void { + auto add_root_interval(Interval interval, TypedNfaState* dest_state) -> void { m_root->add_interval(interval, dest_state); } - auto set_root(NfaStateType* root) -> void { m_root = root; } + auto set_root(TypedNfaState* root) -> void { m_root = root; } - auto get_root() -> NfaStateType* { return m_root; } + auto get_root() -> TypedNfaState* { return m_root; } private: - std::vector> m_states; - NfaStateType* m_root; + std::vector> m_states; + TypedNfaState* m_root; // Store the rules locally as they contain information needed by the NFA. E.g., transitions in // the NFA point to tags in the rule ASTs. - std::vector> m_rules; + std::vector> m_rules; }; -template -Nfa::Nfa(std::vector> rules) +template +Nfa::Nfa(std::vector> rules) : m_root{new_state()}, m_rules{std::move(rules)} { for (auto const& rule : m_rules) { @@ -103,35 +103,35 @@ Nfa::Nfa(std::vector> rules) } } -template -auto Nfa::new_state() -> NfaStateType* { - m_states.emplace_back(std::make_unique()); +template +auto Nfa::new_state() -> TypedNfaState* { + m_states.emplace_back(std::make_unique()); return m_states.back().get(); } -template -auto Nfa::new_state_with_positive_tagged_end_transition( +template +auto Nfa::new_state_with_positive_tagged_end_transition( Tag const* tag, - NfaStateType const* dest_state -) -> NfaStateType* { - m_states.emplace_back(std::make_unique(tag, dest_state)); + TypedNfaState const* dest_state +) -> TypedNfaState* { + m_states.emplace_back(std::make_unique(tag, dest_state)); return m_states.back().get(); } -template -auto Nfa::new_state_with_negative_tagged_transition( +template +auto Nfa::new_state_with_negative_tagged_transition( std::vector tags, - NfaStateType const* dest_state -) -> NfaStateType* { - m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); + TypedNfaState const* dest_state +) -> TypedNfaState* { + m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); } -template -auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( +template +auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, - NfaStateType const* dest_state -) -> std::pair { + TypedNfaState const* dest_state +) -> std::pair { auto* start_state = new_state(); m_root->add_positive_tagged_start_transition(tag, start_state); @@ -139,16 +139,16 @@ auto Nfa::new_start_and_end_states_with_positive_tagged_transition return {start_state, end_state}; } -template -auto Nfa::get_bfs_traversal_order() const -> std::vector { - std::queue state_queue; - std::unordered_set visited_states; - std::vector visited_order; +template +auto Nfa::get_bfs_traversal_order() const -> std::vector { + std::queue state_queue; + std::unordered_set visited_states; + std::vector visited_order; visited_states.reserve(m_states.size()); visited_order.reserve(m_states.size()); auto add_to_queue_and_visited - = [&state_queue, &visited_states](NfaStateType const* dest_state) { + = [&state_queue, &visited_states](TypedNfaState const* dest_state) { if (visited_states.insert(dest_state).second) { state_queue.push(dest_state); } @@ -190,11 +190,11 @@ auto Nfa::get_bfs_traversal_order() const -> std::vector -auto Nfa::serialize() const -> std::string { +template +auto Nfa::serialize() const -> std::string { auto const traversal_order = get_bfs_traversal_order(); - std::unordered_map state_ids; + std::unordered_map state_ids; for (auto const* state : traversal_order) { state_ids.emplace(state, state_ids.size()); } diff --git a/src/log_surgeon/finite_automata/NfaState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp index 7c406dc2..339f38f0 100644 --- a/src/log_surgeon/finite_automata/NfaState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -20,8 +20,8 @@ namespace log_surgeon::finite_automata { template class NfaState; -using NfaByteState = NfaState; -using NfaUtf8State = NfaState; +using ByteNfaState = NfaState; +using Utf8NfaState = NfaState; template class NfaState { @@ -113,7 +113,7 @@ class NfaState { std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == - // DfaStateType::Byte` case, so we use an empty class (`std::tuple<>`) + // NfaStateType::Byte` case, so we use an empty class (`std::tuple<>`) // in that case. std::conditional_t> m_tree_transitions; }; @@ -137,7 +137,7 @@ auto NfaState::add_interval(Interval interval, NfaState* dest_state) uint32_t overlap_low = std::max(data.m_interval.first, interval.first); uint32_t overlap_high = std::min(data.m_interval.second, interval.second); - std::vector tree_states = data.m_value; + std::vector tree_states = data.m_value; tree_states.push_back(dest_state); m_tree_transitions.insert(Interval(overlap_low, overlap_high), tree_states); if (data.m_interval.first < interval.first) { diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index ab88d805..e2de78aa 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -65,6 +65,11 @@ class PrefixTree { [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } + /** + * Gets the parent ID without checking if it's `std::nullopt`. + * NOTE: This method should only be used if the caller has checked the node is not the root. + * @return The ID of the parent node in the prefix tree. + */ [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) return m_parent_id.value(); diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 19345923..bb55f62d 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -24,7 +24,7 @@ #include namespace log_surgeon::finite_automata { -template +template class Nfa; // TODO: rename `RegexAST` to `RegexASTNode` @@ -40,9 +40,9 @@ class Nfa; * ASTs built using this class are assumed to be constructed in a bottom-up manner, where all * descendant nodes are created first. * - * @tparam NfaStateType Whether this AST is used for byte lexing or UTF-8 lexing. + * @tparam TypedNfaState Whether this AST is used for byte lexing or UTF-8 lexing. */ -template +template class RegexAST { public: RegexAST() = default; @@ -75,7 +75,7 @@ class RegexAST { * @param nfa * @param end_state */ - virtual auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void = 0; + virtual auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void = 0; /** * Serializes the AST with this node as the root. @@ -109,7 +109,7 @@ class RegexAST { * @param end_state */ auto - add_to_nfa_with_negative_tags(Nfa* nfa, NfaStateType* end_state) const -> void { + add_to_nfa_with_negative_tags(Nfa* nfa, TypedNfaState* end_state) const -> void { // Handle negative tags as: // root --(regex)--> state_with_negative_tagged_transition --(negative tags)--> end_state if (false == m_negative_tags.empty()) { @@ -155,10 +155,10 @@ class RegexAST { * repetition with a minimum repetition of 0. Namely, we treat `R{0,N}` as `R{1,N} | ∅`. Then, the * NFA handles the 0 repetition case using the logic in `RegexASTOR` (i.e., adding a negative * transition for every capture group matched in `R{1,N}`). - * @tparam NfaStateType Whether this AST is used for byte lexing or UTF-8 lexing. + * @tparam TypedNfaState Whether this AST is used for byte lexing or UTF-8 lexing. */ -template -class RegexASTEmpty : public RegexAST { +template +class RegexASTEmpty : public RegexAST { public: RegexASTEmpty() = default; @@ -178,8 +178,8 @@ class RegexASTEmpty : public RegexAST { } auto add_to_nfa( - [[maybe_unused]] Nfa* nfa, - [[maybe_unused]] NfaStateType* end_state + [[maybe_unused]] Nfa* nfa, + [[maybe_unused]] TypedNfaState* end_state ) const -> void override { // Do nothing as adding an empty node to the NFA is a null operation. } @@ -187,8 +187,8 @@ class RegexASTEmpty : public RegexAST { [[nodiscard]] auto serialize() const -> std::u32string override; }; -template -class RegexASTLiteral : public RegexAST { +template +class RegexASTLiteral : public RegexAST { public: explicit RegexASTLiteral(uint32_t character); @@ -226,7 +226,7 @@ class RegexASTLiteral : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -236,8 +236,8 @@ class RegexASTLiteral : public RegexAST { uint32_t m_character; }; -template -class RegexASTInteger : public RegexAST { +template +class RegexASTInteger : public RegexAST { public: explicit RegexASTInteger(uint32_t digit); @@ -279,7 +279,7 @@ class RegexASTInteger : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -291,24 +291,24 @@ class RegexASTInteger : public RegexAST { std::vector m_digits; }; -template -class RegexASTGroup : public RegexAST { +template +class RegexASTGroup : public RegexAST { public: using Range = std::pair; RegexASTGroup() = default; - explicit RegexASTGroup(RegexASTLiteral const* right); + explicit RegexASTGroup(RegexASTLiteral const* right); explicit RegexASTGroup(RegexASTGroup const* right); - RegexASTGroup(RegexASTGroup const* left, RegexASTLiteral const* right); + RegexASTGroup(RegexASTGroup const* left, RegexASTLiteral const* right); RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right); RegexASTGroup( - RegexASTLiteral const* left, - RegexASTLiteral const* right + RegexASTLiteral const* left, + RegexASTLiteral const* right ); RegexASTGroup(uint32_t min, uint32_t max); @@ -387,7 +387,7 @@ class RegexASTGroup : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -424,20 +424,20 @@ class RegexASTGroup : public RegexAST { std::vector m_ranges; }; -template -class RegexASTOr : public RegexAST { +template +class RegexASTOr : public RegexAST { public: ~RegexASTOr() override = default; RegexASTOr( - std::unique_ptr> left, - std::unique_ptr> right + std::unique_ptr> left, + std::unique_ptr> right ); RegexASTOr(RegexASTOr const& rhs) - : RegexAST(rhs), - m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : RegexAST(rhs), + m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * Used for cloning a unique_pointer of type RegexASTOr @@ -474,33 +474,33 @@ class RegexASTOr : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } + [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } - [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } + [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } private: - std::unique_ptr> m_left; - std::unique_ptr> m_right; + std::unique_ptr> m_left; + std::unique_ptr> m_right; }; -template -class RegexASTCat : public RegexAST { +template +class RegexASTCat : public RegexAST { public: ~RegexASTCat() override = default; RegexASTCat( - std::unique_ptr> left, - std::unique_ptr> right + std::unique_ptr> left, + std::unique_ptr> right ); RegexASTCat(RegexASTCat const& rhs) - : RegexAST(rhs), - m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : RegexAST(rhs), + m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * Used for cloning a unique_pointer of type RegexASTCat @@ -537,33 +537,33 @@ class RegexASTCat : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } + [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } - [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } + [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } private: - std::unique_ptr> m_left; - std::unique_ptr> m_right; + std::unique_ptr> m_left; + std::unique_ptr> m_right; }; -template -class RegexASTMultiplication : public RegexAST { +template +class RegexASTMultiplication : public RegexAST { public: ~RegexASTMultiplication() override = default; RegexASTMultiplication( - std::unique_ptr> operand, + std::unique_ptr> operand, uint32_t min, uint32_t max ); RegexASTMultiplication(RegexASTMultiplication const& rhs) - : RegexAST(rhs), - m_operand(std::unique_ptr>(rhs.m_operand->clone())), + : RegexAST(rhs), + m_operand(std::unique_ptr>(rhs.m_operand->clone())), m_min(rhs.m_min), m_max(rhs.m_max) {} @@ -601,13 +601,13 @@ class RegexASTMultiplication : public RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; [[nodiscard]] auto is_infinite() const -> bool { return m_max == 0; } - [[nodiscard]] auto get_operand() const -> std::unique_ptr> const& { + [[nodiscard]] auto get_operand() const -> std::unique_ptr> const& { return m_operand; } @@ -616,7 +616,7 @@ class RegexASTMultiplication : public RegexAST { [[nodiscard]] auto get_max() const -> uint32_t { return m_max; } private: - std::unique_ptr> m_operand; + std::unique_ptr> m_operand; uint32_t m_min; uint32_t m_max; }; @@ -626,10 +626,10 @@ class RegexASTMultiplication : public RegexAST { * NOTE: * - `m_tag` is always expected to be non-null. * - `m_group_regex_ast` is always expected to be non-null. - * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template -class RegexASTCapture : public RegexAST { +template +class RegexASTCapture : public RegexAST { public: ~RegexASTCapture() override = default; @@ -639,7 +639,7 @@ class RegexASTCapture : public RegexAST { * @throw std::invalid_argument if `group_regex_ast` or `tag` are `nullptr`. */ RegexASTCapture( - std::unique_ptr> group_regex_ast, + std::unique_ptr> group_regex_ast, std::unique_ptr tag ) : m_group_regex_ast{( @@ -649,19 +649,19 @@ class RegexASTCapture : public RegexAST { )}, m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : std::move(tag)} { - RegexAST::set_subtree_positive_tags( + RegexAST::set_subtree_positive_tags( m_group_regex_ast->get_subtree_positive_tags() ); - RegexAST::add_subtree_positive_tags({m_tag.get()}); + RegexAST::add_subtree_positive_tags({m_tag.get()}); } RegexASTCapture(RegexASTCapture const& rhs) - : RegexAST{rhs}, + : RegexAST{rhs}, m_group_regex_ast{ - std::unique_ptr>(rhs.m_group_regex_ast->clone()) + std::unique_ptr>(rhs.m_group_regex_ast->clone()) }, m_tag{std::make_unique(*rhs.m_tag)} { - RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); } /** @@ -697,159 +697,161 @@ class RegexASTCapture : public RegexAST { * @param nfa * @param dest_state */ - auto add_to_nfa(Nfa* nfa, NfaStateType* dest_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* dest_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; [[nodiscard]] auto get_group_name() const -> std::string_view { return m_tag->get_name(); } [[nodiscard]] auto get_group_regex_ast( - ) const -> std::unique_ptr> const& { + ) const -> std::unique_ptr> const& { return m_group_regex_ast; } private: - std::unique_ptr> m_group_regex_ast; + std::unique_ptr> m_group_regex_ast; std::unique_ptr m_tag; }; -template -[[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { - return fmt::format(U"{}", RegexAST::serialize_negative_tags()); +template +[[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { + return fmt::format(U"{}", RegexAST::serialize_negative_tags()); } -template -RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} +template +RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} -template -void RegexASTLiteral::add_to_nfa(Nfa* nfa, NfaStateType* end_state) +template +void RegexASTLiteral::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { nfa->add_root_interval(Interval(m_character, m_character), end_state); } -template -[[nodiscard]] auto RegexASTLiteral::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTLiteral::serialize() const -> std::u32string { return fmt::format( U"{}{}", static_cast(m_character), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTInteger::RegexASTInteger(uint32_t digit) { +template +RegexASTInteger::RegexASTInteger(uint32_t digit) { digit = digit - '0'; m_digits.push_back(digit); } -template -RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t digit) +template +RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t digit) : m_digits(std::move(left->m_digits)) { digit = digit - '0'; m_digits.push_back(digit); } -template -void RegexASTInteger::add_to_nfa( - [[maybe_unused]] Nfa* nfa, - [[maybe_unused]] NfaStateType* end_state +template +void RegexASTInteger::add_to_nfa( + [[maybe_unused]] Nfa* nfa, + [[maybe_unused]] TypedNfaState* end_state ) const { throw std::runtime_error("Unsupported"); } -template -[[nodiscard]] auto RegexASTInteger::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTInteger::serialize() const -> std::u32string { auto const digits_string = fmt::format("{}", fmt::join(m_digits, "")); return fmt::format( U"{}{}", std::u32string(digits_string.begin(), digits_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTOr::RegexASTOr( - std::unique_ptr> left, - std::unique_ptr> right +template +RegexASTOr::RegexASTOr( + std::unique_ptr> left, + std::unique_ptr> right ) : m_left(std::move(left)), m_right(std::move(right)) { m_left->set_negative_tags(m_right->get_subtree_positive_tags()); m_right->set_negative_tags(m_left->get_subtree_positive_tags()); - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); + RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); } -template -void RegexASTOr::add_to_nfa(Nfa* nfa, NfaStateType* end_state) const { +template +void RegexASTOr::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) + const { m_left->add_to_nfa_with_negative_tags(nfa, end_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); } -template -[[nodiscard]] auto RegexASTOr::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTOr::serialize() const -> std::u32string { return fmt::format( U"({})|({}){}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTCat::RegexASTCat( - std::unique_ptr> left, - std::unique_ptr> right +template +RegexASTCat::RegexASTCat( + std::unique_ptr> left, + std::unique_ptr> right ) : m_left(std::move(left)), m_right(std::move(right)) { - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); + RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); } -template -void RegexASTCat::add_to_nfa(Nfa* nfa, NfaStateType* end_state) const { - NfaStateType* saved_root = nfa->get_root(); - NfaStateType* intermediate_state = nfa->new_state(); +template +void RegexASTCat::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) + const { + TypedNfaState* saved_root = nfa->get_root(); + TypedNfaState* intermediate_state = nfa->new_state(); m_left->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); nfa->set_root(saved_root); } -template -[[nodiscard]] auto RegexASTCat::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTCat::serialize() const -> std::u32string { return fmt::format( U"{}{}{}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTMultiplication::RegexASTMultiplication( - std::unique_ptr> operand, +template +RegexASTMultiplication::RegexASTMultiplication( + std::unique_ptr> operand, uint32_t const min, uint32_t const max ) : m_operand(std::move(operand)), m_min(min), m_max(max) { - RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); } -template -void RegexASTMultiplication::add_to_nfa( - Nfa* nfa, - NfaStateType* end_state +template +void RegexASTMultiplication::add_to_nfa( + Nfa* nfa, + TypedNfaState* end_state ) const { - NfaStateType* saved_root = nfa->get_root(); - if (0 == m_min) { + TypedNfaState* saved_root = nfa->get_root(); + if (m_min == 0) { nfa->get_root()->add_epsilon_transition(end_state); } else { for (uint32_t i = 1; i < m_min; i++) { - NfaStateType* intermediate_state = nfa->new_state(); + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } @@ -859,14 +861,14 @@ void RegexASTMultiplication::add_to_nfa( nfa->set_root(end_state); m_operand->add_to_nfa_with_negative_tags(nfa, end_state); } else if (m_max > m_min) { - if (0 != m_min) { - NfaStateType* intermediate_state = nfa->new_state(); + if (m_min != 0) { + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } for (uint32_t i = m_min + 1; i < m_max; ++i) { m_operand->add_to_nfa_with_negative_tags(nfa, end_state); - NfaStateType* intermediate_state = nfa->new_state(); + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } @@ -875,8 +877,8 @@ void RegexASTMultiplication::add_to_nfa( nfa->set_root(saved_root); } -template -[[nodiscard]] auto RegexASTMultiplication::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTMultiplication::serialize() const -> std::u32string { auto const min_string = std::to_string(m_min); auto const max_string = std::to_string(m_max); @@ -885,12 +887,12 @@ template nullptr != m_operand ? m_operand->serialize() : U"null", std::u32string(min_string.begin(), min_string.end()), is_infinite() ? U"inf" : std::u32string(max_string.begin(), max_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -auto RegexASTCapture::add_to_nfa(Nfa* nfa, NfaStateType* dest_state) +template +auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNfaState* dest_state) const -> void { // TODO: move this into a documentation file in the future, and reference it here. // The NFA constructed for a capture group follows the structure below, with tagged transitions @@ -935,21 +937,21 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, NfaStateT nfa->set_root(initial_root); } -template -[[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { auto const tag_name_u32 = std::u32string(m_tag->get_name().cbegin(), m_tag->get_name().cend()); return fmt::format( U"({})<{}>{}", m_group_regex_ast->serialize(), tag_name_u32, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTGroup::RegexASTGroup( +template +RegexASTGroup::RegexASTGroup( RegexASTGroup const* left, - RegexASTLiteral const* right + RegexASTLiteral const* right ) { if (right == nullptr) { throw std::runtime_error("RegexASTGroup1: right == nullptr: A bracket expression in the " @@ -961,16 +963,16 @@ RegexASTGroup::RegexASTGroup( m_ranges.emplace_back(right->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right) +template +RegexASTGroup::RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right) : m_negate(left->m_negate), m_ranges(left->m_ranges) { assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } -template -RegexASTGroup::RegexASTGroup(RegexASTLiteral const* right) { +template +RegexASTGroup::RegexASTGroup(RegexASTLiteral const* right) { if (right == nullptr) { throw std::runtime_error("RegexASTGroup2: right == nullptr: A bracket expression in the " "schema contains illegal characters, remember to escape special " @@ -980,16 +982,16 @@ RegexASTGroup::RegexASTGroup(RegexASTLiteral const* m_ranges.emplace_back(right->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(RegexASTGroup const* right) : m_negate(false) { +template +RegexASTGroup::RegexASTGroup(RegexASTGroup const* right) : m_negate(false) { assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } -template -RegexASTGroup::RegexASTGroup( - RegexASTLiteral const* left, - RegexASTLiteral const* right +template +RegexASTGroup::RegexASTGroup( + RegexASTLiteral const* left, + RegexASTLiteral const* right ) { if (left == nullptr || right == nullptr) { throw std::runtime_error( @@ -1003,22 +1005,22 @@ RegexASTGroup::RegexASTGroup( m_ranges.emplace_back(left->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(std::vector const& literals) +template +RegexASTGroup::RegexASTGroup(std::vector const& literals) : m_negate(false) { for (uint32_t literal : literals) { m_ranges.emplace_back(literal, literal); } } -template -RegexASTGroup::RegexASTGroup(uint32_t min, uint32_t max) : m_negate(false) { +template +RegexASTGroup::RegexASTGroup(uint32_t min, uint32_t max) : m_negate(false) { m_ranges.emplace_back(min, max); } // ranges must be sorted -template -auto RegexASTGroup::merge(std::vector const& ranges) -> std::vector { +template +auto RegexASTGroup::merge(std::vector const& ranges) -> std::vector { std::vector merged_ranges; if (ranges.empty()) { return merged_ranges; @@ -1038,8 +1040,8 @@ auto RegexASTGroup::merge(std::vector const& ranges) -> std } // ranges must be sorted and non-overlapping -template -auto RegexASTGroup::complement(std::vector const& ranges +template +auto RegexASTGroup::complement(std::vector const& ranges ) -> std::vector { std::vector complemented; uint32_t low = 0; @@ -1055,8 +1057,8 @@ auto RegexASTGroup::complement(std::vector const& ranges return complemented; } -template -void RegexASTGroup::add_to_nfa(Nfa* nfa, NfaStateType* end_state) +template +void RegexASTGroup::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { // TODO: there should be a better way to do this with a set and keep m_ranges sorted, but we // have to consider removing overlap + taking the compliment. @@ -1071,8 +1073,8 @@ void RegexASTGroup::add_to_nfa(Nfa* nfa, NfaStateTyp } } -template -[[nodiscard]] auto RegexASTGroup::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTGroup::serialize() const -> std::u32string { std::u32string ranges_serialized; if (m_is_wildcard) { ranges_serialized += U"*"; @@ -1098,7 +1100,7 @@ template U"[{}{}]{}", m_negate ? U"^" : U"", ranges_serialized, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp b/src/log_surgeon/finite_automata/RegexDFAStateType.hpp deleted file mode 100644 index ae4e52d4..00000000 --- a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE - -#include - -namespace log_surgeon::finite_automata { -enum class RegexDFAStateType : uint8_t { - Byte, - UTF8 -}; -} // namespace log_surgeon::finite_automata - -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 4da9b5fa..43315b2a 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -1,6 +1,7 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION #define LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION +#include #include #include #include @@ -15,9 +16,9 @@ namespace log_surgeon::finite_automata { /** * Represents an NFA transition indicating that a capture group has been matched. * NOTE: `m_tag` is always expected to be non-null. - * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template +template class PositiveTaggedTransition { public: /** @@ -25,18 +26,18 @@ class PositiveTaggedTransition { * @param dest_state * @throw std::invalid_argument if `tag` is `nullptr`. */ - PositiveTaggedTransition(Tag const* tag, NfaStateType const* dest_state) + PositiveTaggedTransition(Tag const* tag, TypedNfaState const* dest_state) : m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : tag}, m_dest_state{dest_state} {} - [[nodiscard]] auto get_dest_state() const -> NfaStateType const* { return m_dest_state; } + [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the positive tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ - [[nodiscard]] auto serialize(std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { @@ -47,15 +48,15 @@ class PositiveTaggedTransition { private: Tag const* m_tag; - NfaStateType const* m_dest_state; + TypedNfaState const* m_dest_state; }; /** * Represents an NFA transition indicating that a capture group has been unmatched. * NOTE: All tags in `m_tags` are always expected to be non-null. - * @tparam NfaStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template +template class NegativeTaggedTransition { public: /** @@ -63,7 +64,7 @@ class NegativeTaggedTransition { * @param dest_state * @throw std::invalid_argument if any elements in `tags` is `nullptr`. */ - NegativeTaggedTransition(std::vector tags, NfaStateType const* dest_state) + NegativeTaggedTransition(std::vector tags, TypedNfaState const* dest_state) : m_tags{[&tags] { if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { throw std::invalid_argument("Tags cannot contain null elements"); @@ -72,14 +73,14 @@ class NegativeTaggedTransition { }()}, m_dest_state{dest_state} {} - [[nodiscard]] auto get_dest_state() const -> NfaStateType const* { return m_dest_state; } + [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the negative tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ - [[nodiscard]] auto serialize(std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { @@ -93,7 +94,7 @@ class NegativeTaggedTransition { private: std::vector m_tags; - NfaStateType const* m_dest_state; + TypedNfaState const* m_dest_state; }; } // namespace log_surgeon::finite_automata diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1e4a8363..0551615b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,9 +11,9 @@ set( ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp - ../src/log_surgeon/LALR1Parser.cpp - ../src/log_surgeon/LALR1Parser.hpp - ../src/log_surgeon/LALR1Parser.tpp + ../src/log_surgeon/Lalr1Parser.cpp + ../src/log_surgeon/Lalr1Parser.hpp + ../src/log_surgeon/Lalr1Parser.tpp ../src/log_surgeon/ParserInputBuffer.hpp ../src/log_surgeon/ParserInputBuffer.cpp ../src/log_surgeon/Schema.hpp diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 834e7fec..160d421a 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -13,22 +13,22 @@ #include using log_surgeon::cSizeOfByte; -using log_surgeon::finite_automata::NfaByteState; +using log_surgeon::finite_automata::ByteNfaState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using std::string; using std::stringstream; using std::vector; -using ByteLexicalRule = log_surgeon::LexicalRule; -using ByteNFA = log_surgeon::finite_automata::Nfa; -using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat; -using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture; -using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup; -using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral; +using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteNFA = log_surgeon::finite_automata::Nfa; +using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat; +using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture; +using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup; +using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte - = log_surgeon::finite_automata::RegexASTMultiplication; -using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTMultiplication; +using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; TEST_CASE("Test NFA", "[NFA]") { Schema schema; diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index a3ab69fe..48b2185c 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -19,17 +19,17 @@ using std::vector; using std::wstring_convert; using RegexASTCatByte - = log_surgeon::finite_automata::RegexASTCat; + = log_surgeon::finite_automata::RegexASTCat; using RegexASTCaptureByte - = log_surgeon::finite_automata::RegexASTCapture; + = log_surgeon::finite_automata::RegexASTCapture; using RegexASTGroupByte - = log_surgeon::finite_automata::RegexASTGroup; + = log_surgeon::finite_automata::RegexASTGroup; using RegexASTLiteralByte - = log_surgeon::finite_automata::RegexASTLiteral; + = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< - log_surgeon::finite_automata::NfaByteState>; + log_surgeon::finite_automata::ByteNfaState>; using RegexASTOrByte - = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTOr; using log_surgeon::SchemaVarAST; namespace { diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 27c79882..66d8f8a0 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -10,22 +10,11 @@ using log_surgeon::finite_automata::PrefixTree; using id_t = PrefixTree::id_t; using position_t = PrefixTree::position_t; -constexpr auto cRootId{PrefixTree::cRootId}; -constexpr id_t cInvalidNodeId{100}; -constexpr position_t cInsertPos1{4}; -constexpr position_t cInsertPos2{7}; -constexpr position_t cInsertPos3{9}; -constexpr position_t cMaxPos{std::numeric_limits::max()}; -constexpr position_t cNegativePos1{-1}; -constexpr position_t cNegativePos2{-100}; -constexpr position_t cSetPos1{10}; -constexpr position_t cSetPos2{12}; -constexpr position_t cSetPos3{15}; -constexpr position_t cSetPos4{20}; -constexpr position_t cTreeSize1{4}; -constexpr position_t cTreeSize2{8}; - TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + constexpr auto cRootId{PrefixTree::cRootId}; + constexpr position_t cInitialPos1{4}; + constexpr position_t cSetPos1{10}; + SECTION("Newly constructed tree works correctly") { PrefixTree const tree; @@ -34,16 +23,24 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Inserting nodes into the prefix tree works correctly") { + constexpr position_t cInitialPos2{7}; + constexpr position_t cInitialPos3{9}; + constexpr position_t cMaxPos{std::numeric_limits::max()}; + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + constexpr position_t cTreeSize1{4}; + constexpr position_t cTreeSize2{8}; + PrefixTree tree; // Test basic insertions - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos2)}; - auto const node_id_3{tree.insert(node_id_2, cInsertPos3)}; - REQUIRE(std::vector{cInsertPos1} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{cInsertPos2, cInsertPos1} + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInitialPos3)}; + REQUIRE(std::vector{cInitialPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_2)); - REQUIRE(std::vector{cInsertPos3, cInsertPos2, cInsertPos1} + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_3)); REQUIRE(cTreeSize1 == tree.size()); @@ -53,12 +50,12 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { // Test insertion with negative position values auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; - auto const node_id_6{tree.insert(node_id_5, cInsertPos1)}; + auto const node_id_6{tree.insert(node_id_5, cInitialPos1)}; auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cInitialPos1, cNegativePos1} == tree.get_reversed_positions(node_id_6)); - REQUIRE(std::vector{cNegativePos2, cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} == tree.get_reversed_positions(node_id_7)); REQUIRE(cTreeSize2 == tree.size()); } @@ -67,7 +64,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { PrefixTree tree; REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); - tree.insert(cRootId, cInsertPos1); + tree.insert(cRootId, cInitialPos1); REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); REQUIRE_THROWS_AS( @@ -77,13 +74,17 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for a valid index works correctly") { + constexpr position_t cSetPos2{12}; + constexpr position_t cSetPos3{15}; + constexpr position_t cSetPos4{20}; + PrefixTree tree; // Test that you can set the root node for sanity, although this value is not used tree.set(cRootId, cSetPos1); // Test updates to different nodes - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos1)}; + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos1)}; tree.set(node_id_1, cSetPos1); tree.set(node_id_2, cSetPos2); REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); @@ -105,13 +106,15 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for an invalid index throws correctly") { + constexpr id_t cInvalidNodeId{100}; + PrefixTree tree; // Test setting position before any insertions - REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos4), std::out_of_range); + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range); // Test setting position just beyond valid range - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos4), std::out_of_range); + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range); } } diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 2371fc9e..e8102e22 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -12,17 +12,19 @@ using position_t = log_surgeon::finite_automata::PrefixTree::position_t; namespace { /** - * @param handler The register handler that will contain the new registers. - * @param num_registers The number of registers to initialize. + * @param num_registers The number of registers managed by the handler. + * @return The newly initialized register handler. */ -auto registers_init(RegisterHandler& handler, size_t num_registers) -> void; +[[nodiscard]] auto handler_init(size_t num_registers) -> RegisterHandler; -auto registers_init(RegisterHandler& handler, size_t const num_registers) -> void { +auto handler_init(size_t const num_registers) -> RegisterHandler { constexpr position_t cDefaultPos{0}; + RegisterHandler handler; for (size_t i{0}; i < num_registers; ++i) { handler.add_register(i, cDefaultPos); } + return handler; } } // namespace @@ -32,13 +34,12 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { constexpr size_t cRegId1{0}; constexpr size_t cRegId2{1}; - RegisterHandler handler; - SECTION("Initial state is empty") { - REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); + RegisterHandler empty_handler{handler_init(0)}; + REQUIRE_THROWS_AS(empty_handler.get_reversed_positions(cRegId1), std::out_of_range); } - registers_init(handler, cNumRegisters); + RegisterHandler handler{handler_init(cNumRegisters)}; SECTION("Set register position correctly") { handler.set_register(cRegId1, cInitialPos1);