Bug-Fix: Add negative tags for RegexMultiplicationAST with min=0;…

… Update README to include `intersect-test`. (#41) Co-authored-by: Lin Zhihao <[email protected]>
y-scope · Oct 7, 2024 · bb06e57 · bb06e57
1 parent a092206
commit bb06e57
Show file tree

Hide file tree

Showing 8 changed files with 194 additions and 55 deletions.
diff --git a/examples/README.md b/examples/README.md
@@ -1,9 +1,12 @@
 # Examples
 
-There are two example programs in this directory, `buffer-parser` and
+The first two example programs in this directory are `buffer-parser` and
 `reader-parser` corresponding to the [two API styles][1]. They demonstrate
 parsing a log file and printing out the timestamp and log-level of each message,
-as well as any multiline log messages. 
+as well as any multiline log messages.
+
+The third example is `intersect-test` which demonstrates the result of taking
+the intersection between a schema DFA and a search query DFA.
 
 ## Building
 
@@ -24,6 +27,7 @@ The example programs can be run as follows:
 ```shell
 ./examples/build/buffer-parser ./examples/schema.txt log.txt
 ./examples/build/reader-parser ./examples/schema.txt log.txt
+./examples/build/intersect-test
 ```
 
 where:

diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp
@@ -29,7 +29,7 @@ auto get_intersect_for_query(
         processed_search_string.push_back(c);
     }
     log_surgeon::Schema schema;
-    schema.add_variable("search", processed_search_string, -1);
+    schema.add_variable(string("search:") + processed_search_string, -1);
     RegexNFA<RegexNFAByteState> nfa;
     auto schema_ast = schema.release_schema_ast_ptr();
     for (unique_ptr<ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
@@ -51,20 +51,20 @@ auto main() -> int {
         log_surgeon::Schema schema;
         if (0 == i) {
             std::cout << "--Schema1--" << std::endl;
-            schema.add_variable("int", "\\-{0,1}[0-9]+", -1);
-            schema.add_variable("float", "\\-{0,1}[0-9]+\\.[0-9]+", -1);
-            schema.add_variable("hex", "[a-fA-F]+", -1);
-            schema.add_variable("hasNumber", ".*\\d.*", -1);
-            schema.add_variable("equals", ".*=.*[a-zA-Z0-9].*", -1);
-            schema.add_variable("logLevel", "(INFO)|(DEBUG)|(WARN)|(ERROR)|(TRACE)|(FATAL)", -1);
+            schema.add_variable("int:\\-{0,1}[0-9]+", -1);
+            schema.add_variable("float:\\-{0,1}[0-9]+\\.[0-9]+", -1);
+            schema.add_variable("hex:[a-fA-F]+", -1);
+            schema.add_variable("hasNumber:.*\\d.*", -1);
+            schema.add_variable("equals:.*=.*[a-zA-Z0-9].*", -1);
+            schema.add_variable("logLevel:(INFO)|(DEBUG)|(WARN)|(ERROR)|(TRACE)|(FATAL)", -1);
         } else {
             std::cout << "--Schema2--" << std::endl;
-            schema.add_variable("v1", "1", -1);
-            schema.add_variable("v2", "2", -1);
-            schema.add_variable("v3", "3", -1);
-            schema.add_variable("v4", "abc12", -1);
-            schema.add_variable("v5", "23def", -1);
-            schema.add_variable("v6", "123", -1);
+            schema.add_variable("v1:1", -1);
+            schema.add_variable("v2:2", -1);
+            schema.add_variable("v3:3", -1);
+            schema.add_variable("v4:abc12", -1);
+            schema.add_variable("v5:23def", -1);
+            schema.add_variable("v6:123", -1);
         }
         std::map<uint32_t, std::string> m_id_symbol;
         RegexNFA<RegexNFAByteState> nfa;

diff --git a/src/log_surgeon/Schema.cpp b/src/log_surgeon/Schema.cpp
@@ -10,10 +10,8 @@ Schema::Schema() {
 Schema::Schema(std::string const& schema_file_path)
         : m_schema_ast{SchemaParser::try_schema_file(schema_file_path)} {}
 
-auto Schema::add_variable(std::string const& var_name, std::string const& regex, int priority)
-        -> void {
-    std::string unparsed_string = var_name + ":" + regex;
-    std::unique_ptr<SchemaAST> schema_ast = SchemaParser::try_schema_string(unparsed_string);
+auto Schema::add_variable(std::string_view const var_schema, int const priority) const -> void {
+    std::unique_ptr<SchemaAST> const schema_ast = SchemaParser::try_schema_string(var_schema);
     m_schema_ast->add_schema_var(std::move(schema_ast->m_schema_vars[0]), priority);
 }
 }  // namespace log_surgeon
diff --git a/src/log_surgeon/Schema.hpp b/src/log_surgeon/Schema.hpp
@@ -19,15 +19,14 @@ class Schema {
     explicit Schema(std::string const& schema_file_path);
 
     /**
-     * Parses var_name+":"+regex as if it were its own entire schema file. Then
-     * extracts the SchemaVarAST from the resulting SchemaAST and adds it to
-     * m_schema_vars in m_schema_ast. Position in m_schema_vars is determined by
-     * the priority (priority == -1 to set to lowest).
-     * @param var_name
-     * @param regex
+     * Parses `var_schema` as if it were its own entire schema file. Then extracts the
+     * `SchemaVarAST` from the resulting `SchemaAST` and adds it to `m_schema_vars` in
+     * `m_schema_ast`. Position in `m_schema_vars` is determined by the `priority` (`priority` == -1
+     * to set to lowest).
+     * @param var_schema
      * @param priority
      */
-    auto add_variable(std::string const& var_name, std::string const& regex, int priority) -> void;
+    auto add_variable(std::string_view var_schema, int priority) const -> void;
 
     /* Work in progress API to modify a schema object
 

diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp
@@ -4,6 +4,8 @@
 #include <memory>
 #include <span>
 #include <stdexcept>
+#include <string>
+#include <string_view>
 
 #include <log_surgeon/Constants.hpp>
 #include <log_surgeon/FileReader.hpp>
@@ -30,9 +32,12 @@ using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat<
         log_surgeon::finite_automata::RegexNFAByteState>;
 using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture<
         log_surgeon::finite_automata::RegexNFAByteState>;
+using RegexASTEmptyByte = log_surgeon::finite_automata::RegexASTEmpty<
+        log_surgeon::finite_automata::RegexNFAByteState>;
 
 using std::make_unique;
 using std::string;
+using std::string_view;
 using std::unique_ptr;
 
 namespace log_surgeon {
@@ -77,7 +82,7 @@ auto SchemaParser::try_schema_file(string const& schema_file_path) -> unique_ptr
     return schema_ast;
 }
 
-auto SchemaParser::try_schema_string(string const& schema_string) -> unique_ptr<SchemaAST> {
+auto SchemaParser::try_schema_string(string_view const schema_string) -> unique_ptr<SchemaAST> {
     Reader reader{[&](char* dst_buf, size_t count, size_t& read_to) -> ErrorCode {
         uint32_t unparsed_string_pos = 0;
         std::span<char> const buf{dst_buf, count};
@@ -196,8 +201,11 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
 
 static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
     auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
-    return unique_ptr<ParserAST>(new ParserValueRegex(
-            unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), 0, 0))
+
+    // To handle negative tags we treat `R*` as `R+ | ∅`.
+    return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
+            make_unique<RegexASTEmptyByte>(),
+            make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, 0)
     ));
 }
 
@@ -238,6 +246,15 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
         max += r5_ptr->get_digit(i) * (uint32_t)pow(10, r5_size - i - 1);
     }
     auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
+
+    if (0 == min) {
+        // To handle negative tags we treat `R{0,N}` as `R{1,N} | ∅`.
+        return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
+                make_unique<RegexASTEmptyByte>(),
+                make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, max)
+        ));
+    }
+
     return unique_ptr<ParserAST>(new ParserValueRegex(
             unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), min, max))
     ));

diff --git a/src/log_surgeon/SchemaParser.hpp b/src/log_surgeon/SchemaParser.hpp
@@ -1,6 +1,8 @@
 #ifndef LOG_SURGEON_SCHEMA_PARSER_HPP
 #define LOG_SURGEON_SCHEMA_PARSER_HPP
 
+#include <string>
+#include <string_view>
 #include <utility>
 
 #include <log_surgeon/LALR1Parser.hpp>
@@ -94,7 +96,7 @@ class SchemaParser : public LALR1Parser<
      * @param schema_string
      * @return std::unique_ptr<SchemaAST>
      */
-    static auto try_schema_string(std::string const& schema_string) -> std::unique_ptr<SchemaAST>;
+    static auto try_schema_string(std::string_view schema_string) -> std::unique_ptr<SchemaAST>;
 
     static auto get_special_regex_characters() -> std::unordered_map<char, std::string> const& {
         return m_special_regex_characters;

diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp
@@ -124,6 +124,43 @@ class RegexAST {
     std::set<uint32_t> m_negative_tags;
 };
 
+/**
+ * Class for an empty AST node. This is used to simplify tagged-NFA creation when using regex
+ * repetition with a minimum repetition of 0. Namely, we treat `R{0,N}` as `R{1,N} | ∅`. Then, the
+ * NFA handles the 0 repetition case using the logic in `RegexASTOr` (i.e., adding a negative
+ * transition for every capture group matched in `R{1,N}`).
+ * @tparam NFAStateType Whether this AST is used for byte lexing or UTF-8 lexing.
+ */
+template <typename NFAStateType>
+class RegexASTEmpty : public RegexAST<NFAStateType> {
+public:
+    RegexASTEmpty() = default;
+
+    [[nodiscard]] auto clone() const -> gsl::owner<RegexASTEmpty*> override {
+        return new RegexASTEmpty(*this);
+    }
+
+    auto set_possible_inputs_to_true(
+            [[maybe_unused]] std::array<bool, cSizeOfUnicode>& is_possible_input
+    ) const -> void override {
+        // Do nothing as an empty node contains no utf8 characters.
+    }
+
+    auto remove_delimiters_from_wildcard([[maybe_unused]] std::vector<uint32_t>& delimiters
+    ) -> void override {
+        // Do nothing as an empty node contains no delimiters.
+    }
+
+    auto add_to_nfa(
+            [[maybe_unused]] RegexNFA<NFAStateType>* nfa,
+            [[maybe_unused]] NFAStateType* end_state
+    ) const -> void override {
+        // Do nothing as adding an empty node to the NFA is a null operation.
+    }
+
+    [[nodiscard]] auto serialize() const -> std::u32string override;
+};
+
 template <typename NFAStateType>
 class RegexASTLiteral : public RegexAST<NFAStateType> {
 public:
@@ -233,7 +270,7 @@ class RegexASTGroup : public RegexAST<NFAStateType> {
 public:
     using Range = std::pair<uint32_t, uint32_t>;
 
-    RegexASTGroup();
+    RegexASTGroup() = default;
 
     explicit RegexASTGroup(RegexASTLiteral<NFAStateType> const* right);
 
@@ -655,6 +692,11 @@ class RegexASTCapture : public RegexAST<NFAStateType> {
     uint32_t m_tag;
 };
 
+template <typename NFAStateType>
+[[nodiscard]] auto RegexASTEmpty<NFAStateType>::serialize() const -> std::u32string {
+    return fmt::format(U"{}", RegexAST<NFAStateType>::serialize_negative_tags());
+}
+
 template <typename NFAStateType>
 RegexASTLiteral<NFAStateType>::RegexASTLiteral(uint32_t character) : m_character(character) {}
 
@@ -820,7 +862,7 @@ template <typename NFAStateType>
     auto const max_string = std::to_string(m_max);
 
     return fmt::format(
-            U"{}{{{},{}}}{}",
+            U"({}){{{},{}}}{}",
             nullptr != m_operand ? m_operand->serialize() : U"null",
             std::u32string(min_string.begin(), min_string.end()),
             is_infinite() ? U"inf" : std::u32string(max_string.begin(), max_string.end()),
@@ -844,9 +886,6 @@ template <typename NFAStateType>
     );
 }
 
-template <typename NFAStateType>
-RegexASTGroup<NFAStateType>::RegexASTGroup() = default;
-
 template <typename NFAStateType>
 RegexASTGroup<NFAStateType>::RegexASTGroup(
         RegexASTGroup const* left,