y-scope · davidlion · Sep 22, 2023 · Sep 17, 2023 · Sep 17, 2023 · Sep 17, 2023
@@ -96,7 +96,7 @@ delimiters: \t\r\n:,!;%
 timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1}
 timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\]
 int:\-{0,1}[0-9]+
-double:\-{0,1}[0-9]+\.[0-9]+
+float:\-{0,1}[0-9]+\.[0-9]+
 
 // Custom variables
 hex:[a-fA-F]+
@@ -109,7 +109,7 @@ equals:.*=.*[a-zA-Z0-9].*
 * `timestamp` matches two different patterns:
     * 2023-04-19 12:32:08.064
     * [20230419-12:32:08]
-* `int`, `double`, `hex`, `hasNumber`, and `equals` all match different user
+* `int`, `float`, `hex`, `hasNumber`, and `equals` all match different user
   defined variables.
 
 ## Regular Expression Syntax

@@ -54,7 +54,7 @@ delimiters: \t\r\n:,!;%
 
 // First set of variables
 int:\-{0,1}[0-9]+
-double:\-{0,1}[0-9]+\.[0-9]+
+float:\-{0,1}[0-9]+\.[0-9]+
 
 // Second set of variables
 hex:[a-fA-F]+

@@ -48,6 +48,7 @@ namespace utf8 {
     // 0xFF are invalid UTF-8 code units
     static unsigned char const cCharEOF = 0xFF;
     static unsigned char const cCharErr = 0xFE;
+    static unsigned char const cCharSOF = 0xFD;
 }  // namespace utf8
 }  // namespace log_surgeon
 

@@ -83,6 +83,13 @@ class Lexer {
      */
     auto reset() -> void;
 
+    /**
+     * Set the lexer state as if it had already read a delimiter, so that the
+     * start of file is treated as a delimiter.
+     * @param input_buffer The buffer containing the data to be lexed.
+     */
+    auto prepend_SOF(ParserInputBuffer& input_buffer) -> void;
+
     /**
      * Flip lexer states to match static buffer flipping.
      * @param old_storage_size The previous buffer size used to calculate the

@@ -329,6 +329,16 @@ void Lexer<NFAStateType, DFAStateType>::reset() {
     m_prev_state = nullptr;
 }
 
+template <typename NFAStateType, typename DFAStateType>
+void Lexer<NFAStateType, DFAStateType>::prepend_SOF(ParserInputBuffer& input_buffer) {
+    m_prev_state = m_dfa->get_root()->next(utf8::cCharSOF);
+    m_asked_for_more_data = true;
+    m_start_pos = input_buffer.storage().pos();
+    m_match_pos = input_buffer.storage().pos();
+    m_match_line = m_line;
+    m_type_ids = nullptr;
+}
+
 template <typename NFAStateType, typename DFAStateType>
 void Lexer<NFAStateType, DFAStateType>::add_delimiters(std::vector<uint32_t> const& delimiters) {
     assert(!delimiters.empty());
@@ -339,6 +349,7 @@ void Lexer<NFAStateType, DFAStateType>::add_delimiters(std::vector<uint32_t> con
     for (uint32_t delimiter : delimiters) {
         m_is_delimiter[delimiter] = true;
     }
+    m_is_delimiter[utf8::cCharSOF] = true;
 }
 
 template <typename NFAStateType, typename DFAStateType>

@@ -63,7 +63,15 @@ void LogParser::add_rules(SchemaAST const* schema_ast) {
             unique_ptr<RegexAST<RegexNFAByteState>> first_timestamp_regex_ast(
                     rule->m_regex_ptr->clone()
             );
-            add_rule("firstTimestamp", std::move(first_timestamp_regex_ast));
+            unique_ptr<RegexASTLiteral<RegexNFAByteState>> r1
+                    = make_unique<RegexASTLiteral<RegexNFAByteState>>(utf8::cCharSOF);
+            add_rule(
+                    "firstTimestamp",
+                    make_unique<RegexASTCat<RegexNFAByteState>>(
+                            std::move(r1),
+                            std::move(first_timestamp_regex_ast)
+                    )
+            );
             unique_ptr<RegexAST<RegexNFAByteState>> newline_timestamp_regex_ast(
                     rule->m_regex_ptr->clone()
             );
@@ -143,6 +151,7 @@ void LogParser::add_rules(SchemaAST const* schema_ast) {
 auto LogParser::reset() -> void {
     m_input_buffer.reset();
     m_lexer.reset();
+    m_lexer.prepend_SOF(m_input_buffer);
 }
 
 // TODO: if the first text is a variable in the no timestamp case you lose the