Skip to content

Commit

Permalink
Bug-Fix: Add negative tags for RegexMultiplicationAST with min=0;…
Browse files Browse the repository at this point in the history
… Update README to include `intersection-test`. (#41)

Co-authored-by: Lin Zhihao <[email protected]>
  • Loading branch information
SharafMohamed and LinZhihao-723 authored Oct 7, 2024
1 parent a092206 commit bb06e57
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 55 deletions.
8 changes: 6 additions & 2 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# Examples

There are two example programs in this directory, `buffer-parser` and
The first two example programs in this directory are `buffer-parser` and
`reader-parser` corresponding to the [two API styles][1]. They demonstrate
parsing a log file and printing out the timestamp and log-level of each message,
as well as any multiline log messages.
as well as any multiline log messages.

The third example is `intersect-test`, which demonstrates the result of taking
the intersection between a schema DFA and a search query DFA.

## Building

Expand All @@ -24,6 +27,7 @@ The example programs can be run as follows:
```shell
./examples/build/buffer-parser ./examples/schema.txt log.txt
./examples/build/reader-parser ./examples/schema.txt log.txt
./examples/build/intersect-test
```

where:
Expand Down
26 changes: 13 additions & 13 deletions examples/intersect-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ auto get_intersect_for_query(
processed_search_string.push_back(c);
}
log_surgeon::Schema schema;
schema.add_variable("search", processed_search_string, -1);
schema.add_variable(string("search:") + processed_search_string, -1);
RegexNFA<RegexNFAByteState> nfa;
auto schema_ast = schema.release_schema_ast_ptr();
for (unique_ptr<ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
Expand All @@ -51,20 +51,20 @@ auto main() -> int {
log_surgeon::Schema schema;
if (0 == i) {
std::cout << "--Schema1--" << std::endl;
schema.add_variable("int", "\\-{0,1}[0-9]+", -1);
schema.add_variable("float", "\\-{0,1}[0-9]+\\.[0-9]+", -1);
schema.add_variable("hex", "[a-fA-F]+", -1);
schema.add_variable("hasNumber", ".*\\d.*", -1);
schema.add_variable("equals", ".*=.*[a-zA-Z0-9].*", -1);
schema.add_variable("logLevel", "(INFO)|(DEBUG)|(WARN)|(ERROR)|(TRACE)|(FATAL)", -1);
schema.add_variable("int:\\-{0,1}[0-9]+", -1);
schema.add_variable("float:\\-{0,1}[0-9]+\\.[0-9]+", -1);
schema.add_variable("hex:[a-fA-F]+", -1);
schema.add_variable("hasNumber:.*\\d.*", -1);
schema.add_variable("equals:.*=.*[a-zA-Z0-9].*", -1);
schema.add_variable("logLevel:(INFO)|(DEBUG)|(WARN)|(ERROR)|(TRACE)|(FATAL)", -1);
} else {
std::cout << "--Schema2--" << std::endl;
schema.add_variable("v1", "1", -1);
schema.add_variable("v2", "2", -1);
schema.add_variable("v3", "3", -1);
schema.add_variable("v4", "abc12", -1);
schema.add_variable("v5", "23def", -1);
schema.add_variable("v6", "123", -1);
schema.add_variable("v1:1", -1);
schema.add_variable("v2:2", -1);
schema.add_variable("v3:3", -1);
schema.add_variable("v4:abc12", -1);
schema.add_variable("v5:23def", -1);
schema.add_variable("v6:123", -1);
}
std::map<uint32_t, std::string> m_id_symbol;
RegexNFA<RegexNFAByteState> nfa;
Expand Down
6 changes: 2 additions & 4 deletions src/log_surgeon/Schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,8 @@ Schema::Schema() {
Schema::Schema(std::string const& schema_file_path)
: m_schema_ast{SchemaParser::try_schema_file(schema_file_path)} {}

// Adds a single variable to this schema. The variable's text is parsed as a standalone schema by
// `SchemaParser::try_schema_string`, and the first parsed variable is moved into `m_schema_ast`
// at the requested `priority`.
//
// This hunk shows the four pre-change lines (two-argument name/regex form) followed by the two
// lines that replace them (single `var_schema` string in `name:regex` form).
auto Schema::add_variable(std::string const& var_name, std::string const& regex, int priority)
-> void {
// Pre-change: the name and regex were joined with ':' before parsing.
std::string unparsed_string = var_name + ":" + regex;
std::unique_ptr<SchemaAST> schema_ast = SchemaParser::try_schema_string(unparsed_string);
// NOTE(review): declared `const` although it adds a variable to the object behind
// `m_schema_ast` — confirm this is intended.
auto Schema::add_variable(std::string_view const var_schema, int const priority) const -> void {
std::unique_ptr<SchemaAST> const schema_ast = SchemaParser::try_schema_string(var_schema);
// NOTE(review): only element 0 of `m_schema_vars` is used and its presence is not checked —
// confirm `try_schema_string` always yields at least one variable (or throws) for valid input.
m_schema_ast->add_schema_var(std::move(schema_ast->m_schema_vars[0]), priority);
}
} // namespace log_surgeon
13 changes: 6 additions & 7 deletions src/log_surgeon/Schema.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ class Schema {
explicit Schema(std::string const& schema_file_path);

/**
* Parses var_name+":"+regex as if it were its own entire schema file. Then
* extracts the SchemaVarAST from the resulting SchemaAST and adds it to
* m_schema_vars in m_schema_ast. Position in m_schema_vars is determined by
* the priority (priority == -1 to set to lowest).
* @param var_name
* @param regex
* Parses `var_schema` as if it were its own entire schema file. Then extracts the
* `SchemaVarAST` from the resulting `SchemaAST` and adds it to `m_schema_vars` in
* `m_schema_ast`. Position in `m_schema_vars` is determined by the `priority` (`priority` == -1
* to set to lowest).
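* @param var_schema The variable's schema text in `name:regex` form (e.g. `int:[0-9]+`).
* @param priority Position of the variable in `m_schema_vars`; -1 gives it the lowest priority.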
*/
auto add_variable(std::string const& var_name, std::string const& regex, int priority) -> void;
auto add_variable(std::string_view var_schema, int priority) const -> void;

/* Work in progress API to modify a schema object
Expand Down
23 changes: 20 additions & 3 deletions src/log_surgeon/SchemaParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <memory>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>

#include <log_surgeon/Constants.hpp>
#include <log_surgeon/FileReader.hpp>
Expand All @@ -30,9 +32,12 @@ using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat<
log_surgeon::finite_automata::RegexNFAByteState>;
using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture<
log_surgeon::finite_automata::RegexNFAByteState>;
using RegexASTEmptyByte = log_surgeon::finite_automata::RegexASTEmpty<
log_surgeon::finite_automata::RegexNFAByteState>;

using std::make_unique;
using std::string;
using std::string_view;
using std::unique_ptr;

namespace log_surgeon {
Expand Down Expand Up @@ -77,7 +82,7 @@ auto SchemaParser::try_schema_file(string const& schema_file_path) -> unique_ptr
return schema_ast;
}

auto SchemaParser::try_schema_string(string const& schema_string) -> unique_ptr<SchemaAST> {
auto SchemaParser::try_schema_string(string_view const schema_string) -> unique_ptr<SchemaAST> {
Reader reader{[&](char* dst_buf, size_t count, size_t& read_to) -> ErrorCode {
uint32_t unparsed_string_pos = 0;
std::span<char> const buf{dst_buf, count};
Expand Down Expand Up @@ -196,8 +201,11 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr<ParserAST> {

// Grammar action for `R*`: takes the operand regex AST from child 0 of `m` and returns it wrapped
// in a new `ParserValueRegex`.
static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
// Operand `R`; ownership is moved out of the child's parser AST below.
auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
// Pre-change form (the two lines removed by this commit): a single multiplication node with
// min = 0 and max = 0.
return unique_ptr<ParserAST>(new ParserValueRegex(
unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), 0, 0))

// To handle negative tags we treat `R*` as `R+ | ∅`.
return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
make_unique<RegexASTEmptyByte>(),
// min = 1, max = 0. NOTE(review): since this encodes `R+`, max == 0 presumably means
// "unbounded" — confirm against `RegexASTMultiplication::is_infinite`.
make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, 0)
));
}

Expand Down Expand Up @@ -238,6 +246,15 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
max += r5_ptr->get_digit(i) * (uint32_t)pow(10, r5_size - i - 1);
}
auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();

if (0 == min) {
// To handle negative tags we treat `R{0,N}` as `R{1,N} | ∅`.
return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
make_unique<RegexASTEmptyByte>(),
make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, max)
));
}

return unique_ptr<ParserAST>(new ParserValueRegex(
unique_ptr<RegexASTByte>(new RegexASTMultiplicationByte(std::move(r1), min, max))
));
Expand Down
4 changes: 3 additions & 1 deletion src/log_surgeon/SchemaParser.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef LOG_SURGEON_SCHEMA_PARSER_HPP
#define LOG_SURGEON_SCHEMA_PARSER_HPP

#include <string>
#include <string_view>
#include <utility>

#include <log_surgeon/LALR1Parser.hpp>
Expand Down Expand Up @@ -94,7 +96,7 @@ class SchemaParser : public LALR1Parser<
* @param schema_string
* @return std::unique_ptr<SchemaAST>
*/
static auto try_schema_string(std::string const& schema_string) -> std::unique_ptr<SchemaAST>;
static auto try_schema_string(std::string_view schema_string) -> std::unique_ptr<SchemaAST>;

static auto get_special_regex_characters() -> std::unordered_map<char, std::string> const& {
return m_special_regex_characters;
Expand Down
49 changes: 44 additions & 5 deletions src/log_surgeon/finite_automata/RegexAST.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,43 @@ class RegexAST {
std::set<uint32_t> m_negative_tags;
};

/**
* Class for an empty AST node. This is used to simplify tagged-NFA creation when using regex
* repetition with a minimum repetition of 0. Namely, we treat `R{0,N}` as `R{1,N} | ∅`. Then, the
* NFA handles the 0 repetition case using the logic in `RegexASTOr` (i.e., adding a negative
* transition for every capture group matched in `R{1,N}`).
* @tparam NFAStateType Whether this AST is used for byte lexing or UTF-8 lexing.
*/
template <typename NFAStateType>
class RegexASTEmpty : public RegexAST<NFAStateType> {
public:
RegexASTEmpty() = default;

/**
* @return A newly allocated copy of this node. The result is a `gsl::owner`, so the caller is
* responsible for releasing it.
*/
[[nodiscard]] auto clone() const -> gsl::owner<RegexASTEmpty*> override {
return new RegexASTEmpty(*this);
}

/**
* No-op: leaves `is_possible_input` untouched.
* @param is_possible_input Unused.
*/
auto set_possible_inputs_to_true(
[[maybe_unused]] std::array<bool, cSizeOfUnicode>& is_possible_input
) const -> void override {
// Do nothing as an empty node contains no utf8 characters.
}

/**
* No-op: leaves `delimiters` untouched.
* @param delimiters Unused.
*/
auto remove_delimiters_from_wildcard([[maybe_unused]] std::vector<uint32_t>& delimiters
) -> void override {
// Do nothing as an empty node contains no delimiters.
}

/**
* No-op: adds nothing to the NFA.
* @param nfa Unused.
* @param end_state Unused.
*/
auto add_to_nfa(
[[maybe_unused]] RegexNFA<NFAStateType>* nfa,
[[maybe_unused]] NFAStateType* end_state
) const -> void override {
// Do nothing as adding an empty node to the NFA is a null operation.
}

/**
* Defined out of line, after the class definitions.
* @return The serialized form of this node.
*/
[[nodiscard]] auto serialize() const -> std::u32string override;
};

template <typename NFAStateType>
class RegexASTLiteral : public RegexAST<NFAStateType> {
public:
Expand Down Expand Up @@ -233,7 +270,7 @@ class RegexASTGroup : public RegexAST<NFAStateType> {
public:
using Range = std::pair<uint32_t, uint32_t>;

RegexASTGroup();
RegexASTGroup() = default;

explicit RegexASTGroup(RegexASTLiteral<NFAStateType> const* right);

Expand Down Expand Up @@ -655,6 +692,11 @@ class RegexASTCapture : public RegexAST<NFAStateType> {
uint32_t m_tag;
};

/**
 * Serializes an empty node. The node has no content of its own, so the output is exactly the
 * serialization of its negative tags.
 * @return The serialized negative tags of this node.
 */
template <typename NFAStateType>
[[nodiscard]] auto RegexASTEmpty<NFAStateType>::serialize() const -> std::u32string {
    auto const negative_tags = RegexAST<NFAStateType>::serialize_negative_tags();
    return fmt::format(U"{}", negative_tags);
}

template <typename NFAStateType>
RegexASTLiteral<NFAStateType>::RegexASTLiteral(uint32_t character) : m_character(character) {}

Expand Down Expand Up @@ -820,7 +862,7 @@ template <typename NFAStateType>
auto const max_string = std::to_string(m_max);

return fmt::format(
U"{}{{{},{}}}{}",
U"({}){{{},{}}}{}",
nullptr != m_operand ? m_operand->serialize() : U"null",
std::u32string(min_string.begin(), min_string.end()),
is_infinite() ? U"inf" : std::u32string(max_string.begin(), max_string.end()),
Expand All @@ -844,9 +886,6 @@ template <typename NFAStateType>
);
}

template <typename NFAStateType>
RegexASTGroup<NFAStateType>::RegexASTGroup() = default;

template <typename NFAStateType>
RegexASTGroup<NFAStateType>::RegexASTGroup(
RegexASTGroup const* left,
Expand Down
Loading

0 comments on commit bb06e57

Please sign in to comment.