fluent · repeatedly · Sep 9, 2019 · Sep 3, 2019 · Sep 3, 2019 · Sep 4, 2019
diff --git a/lib/fluent/plugin/parser_syslog.rb b/lib/fluent/plugin/parser_syslog.rb
@@ -38,6 +38,10 @@ class SyslogParser < Parser
       config_param :message_format, :enum, list: [:rfc3164, :rfc5424, :auto], default: :rfc3164
       desc 'Specify time format for event time for rfc5424 protocol'
       config_param :rfc5424_time_format, :string, default: "%Y-%m-%dT%H:%M:%S.%L%z"
+      desc 'The parser type used to parse syslog message'
+      config_param :parser_type, :enum, list: [:regexp, :string], default: :regexp
+      desc 'support colonless ident in string parser'
+      config_param :support_colonless_ident, :bool, default: true
 
       def initialize
         super
@@ -50,10 +54,17 @@ def configure(conf)
         @time_parser_rfc3164 = @time_parser_rfc5424 = nil
         @time_parser_rfc5424_without_subseconds = nil
         @support_rfc5424_without_subseconds = false
+        @regexp_parser = @parser_type == :regexp
         @regexp = case @message_format
                   when :rfc3164
-                    class << self
-                      alias_method :parse, :parse_plain
+                    if @regexp_parser
+                      class << self
+                        alias_method :parse, :parse_plain
+                      end
+                    else
+                      class << self
+                        alias_method :parse, :parse_rfc3164
+                      end
                     end
                     @with_priority ? REGEXP_WITH_PRI : REGEXP
                   when :rfc5424
@@ -88,11 +99,16 @@ def parse_auto(text, &block)
           @regexp = @with_priority ? REGEXP_RFC5424_WITH_PRI : REGEXP_RFC5424
           @time_parser = @time_parser_rfc5424
           @support_rfc5424_without_subseconds = true
+          parse_plain(text, &block)
         else
           @regexp = @with_priority ? REGEXP_WITH_PRI : REGEXP
           @time_parser = @time_parser_rfc3164
+          if @regexp_parser
+            parse_plain(text, &block)
+          else
+            parse_rfc3164(text, &block)
+          end
         end
-        parse_plain(text, &block)
       end
 
       def parse_plain(text, &block)
@@ -137,6 +153,95 @@ def parse_plain(text, &block)
 
         yield time, record
       end
+
+      SPLIT_CHAR = ' '.freeze
+      PRI_START_CHAR = '<'.freeze
+
+      # Parse an RFC 3164 style line by scanning the string instead of using a regexp.
+      # Expected layout: "[<PRI>]Mmm dd hh:mm:ss[.frac] HOST [IDENT[[PID]][:] ]MSG"
+      #
+      # Yields (time, record) on success and (nil, nil) when the priority,
+      # timestamp or host field cannot be located. record holds 'host' and
+      # 'message', plus 'pri', 'ident', 'pid' and 'time' when applicable.
+      # Errors raised by @time_parser.parse are not rescued here.
+      def parse_rfc3164(text, &block)
+        pri = nil
+        cursor = 0
+        if @with_priority
+          # index returns nil when '>' is missing; report that as unparsable
+          # rather than failing on nil arithmetic below
+          i = text.start_with?(PRI_START_CHAR) ? text.index('>'.freeze, 1) : nil
+          if i.nil?
+            yield nil, nil
+            return
+          end
+          pri = text.slice(1, i - 1).to_i
+          cursor = i + 1
+        end
+
+        # header part
+        time_diff = 15 # skip Mmm dd hh:mm:ss
+        time_end = text[cursor + time_diff]
+        if time_end == SPLIT_CHAR
+          time_str = text.slice(cursor, time_diff)
+          cursor += 16 # time + ' '
+        elsif time_end == '.'.freeze
+          # support subsecond time; search from the end of the seconds field so
+          # the offset does not depend on the length of the priority prefix
+          i = text.index(SPLIT_CHAR, cursor + time_diff)
+          if i.nil?
+            yield nil, nil
+            return
+          end
+          time_str = text.slice(cursor, i - cursor)
+          cursor = i + 1
+        else
+          yield nil, nil
+          return
+        end
+
+        i = text.index(SPLIT_CHAR, cursor)
+        if i.nil?
+          yield nil, nil
+          return
+        end
+        host_diff = i - cursor
+        host = text.slice(cursor, host_diff)
+        cursor += (host_diff + 1)
+
+        record = {'host' => host}
+        record['pri'] = pri if pri
+
+        i = text.index(SPLIT_CHAR, cursor)
+
+        # message part
+        msg = if i.nil?  # for 'only non-space content case'
+                text.slice(cursor, text.bytesize)
+              else
+                # exclusive end of the "IDENT[PID]" token; a trailing ':' is not part of it
+                token_end = text[i - 1] == ':'.freeze ? i - 1 : i
+                if token_end == i && !@support_colonless_ident
+                  text.slice(cursor, text.bytesize)
+                else
+                  # only treat "[...]" as a pid when '[' opens inside this token
+                  j = text[token_end - 1] == ']'.freeze ? text.index('['.freeze, cursor) : nil
+                  if j && j < token_end
+                    record['ident'] = text.slice(cursor, j - cursor)
+                    record['pid'] = text.slice(j + 1, token_end - j - 2) # remove '[' / ']'
+                  else
+                    record['ident'] = text.slice(cursor, token_end - cursor)
+                  end
+                  text.slice(i + 1, text.bytesize)
+                end
+              end
+        msg.chomp!
+        record['message'] = msg
+
+        time = @time_parser.parse(time_str.squeeze(SPLIT_CHAR))
+        record['time'] = time_str if @keep_time_key
+
+        yield time, record
+      end
     end
   end
 end
diff --git a/test/plugin/test_parser_syslog.rb b/test/plugin/test_parser_syslog.rb
@@ -14,8 +14,9 @@ def setup
     }
   end
 
-  def test_parse
-    @parser.configure({})
+  data('regexp' => 'regexp', 'string' => 'string')
+  def test_parse(param)
+    @parser.configure({'parser_type' => param})
     @parser.instance.parse('Feb 28 12:00:00 192.168.0.1 fluentd[11111]: [error] Syslog test') { |time, record|
       assert_equal(event_time('Feb 28 12:00:00', format: '%b %d %H:%M:%S'), time)
       assert_equal(@expected, record)
@@ -24,17 +25,28 @@ def test_parse
     assert_equal("%b %d %H:%M:%S", @parser.instance.patterns['time_format'])
   end
 
-  def test_parse_with_time_format
-    @parser.configure('time_format' => '%b %d %M:%S:%H')
+  data('regexp' => 'regexp', 'string' => 'string')
+  def test_parse_with_time_format(param)
+    @parser.configure('time_format' => '%b %d %M:%S:%H', 'parser_type' => param)
     @parser.instance.parse('Feb 28 00:00:12 192.168.0.1 fluentd[11111]: [error] Syslog test') { |time, record|
       assert_equal(event_time('Feb 28 12:00:00', format: '%b %d %H:%M:%S'), time)
       assert_equal(@expected, record)
     }
     assert_equal('%b %d %M:%S:%H', @parser.instance.patterns['time_format'])
   end
 
-  def test_parse_with_priority
-    @parser.configure('with_priority' => true)
+  data('regexp' => 'regexp', 'string' => 'string') # supplies each label as param, used below as parser_type
+  def test_parse_with_subsecond_time(param) # a timestamp with a fractional-second part must parse under either parser_type
+    @parser.configure('time_format' => '%b %d %H:%M:%S.%N', 'parser_type' => param)
+    @parser.instance.parse('Feb 28 12:00:00.456 192.168.0.1 fluentd[11111]: [error] Syslog test') { |time, record|
+      assert_equal(event_time('Feb 28 12:00:00.456', format: '%b %d %H:%M:%S.%N'), time) # the .456 fraction must be kept in the event time
+      assert_equal(@expected, record)
+    }
+  end
+
+  data('regexp' => 'regexp', 'string' => 'string')
+  def test_parse_with_priority(param)
+    @parser.configure('with_priority' => true, 'parser_type' => param)
     @parser.instance.parse('<6>Feb 28 12:00:00 192.168.0.1 fluentd[11111]: [error] Syslog test') { |time, record|
       assert_equal(event_time('Feb 28 12:00:00', format: '%b %d %H:%M:%S'), time)
       assert_equal(@expected.merge('pri' => 6), record)
@@ -43,8 +55,9 @@ def test_parse_with_priority
     assert_equal("%b %d %H:%M:%S", @parser.instance.patterns['time_format'])
   end
 
-  def test_parse_without_colon
-    @parser.configure({})
+  data('regexp' => 'regexp', 'string' => 'string')
+  def test_parse_without_colon(param)
+    @parser.configure({'parser_type' => param})
     @parser.instance.parse('Feb 28 12:00:00 192.168.0.1 fluentd[11111] [error] Syslog test') { |time, record|
       assert_equal(event_time('Feb 28 12:00:00', format: '%b %d %H:%M:%S'), time)
       assert_equal(@expected, record)
@@ -53,35 +66,100 @@ def test_parse_without_colon
     assert_equal("%b %d %H:%M:%S", @parser.instance.patterns['time_format'])
   end
 
-  def test_parse_with_keep_time_key
+  data('regexp' => 'regexp', 'string' => 'string')
+  def test_parse_with_keep_time_key(param)
     @parser.configure(
                       'time_format' => '%b %d %M:%S:%H',
                       'keep_time_key'=>'true',
+                      'parser_type' => param
                       )
     text = 'Feb 28 00:00:12 192.168.0.1 fluentd[11111]: [error] Syslog test'
     @parser.instance.parse(text) do |time, record|
       assert_equal "Feb 28 00:00:12", record['time']
     end
   end
 
-  def test_parse_various_characters_for_tag
+  data('regexp' => 'regexp', 'string' => 'string')
+  def test_parse_various_characters_for_tag(param)
     ident = '~!@#$%^&*()_+=-`]{};"\'/?\\,.<>'
-    @parser.configure({})
+    @parser.configure({'parser_type' => param})
     @parser.instance.parse("Feb 28 12:00:00 192.168.0.1 #{ident}[11111]: [error] Syslog test") { |time, record|
       assert_equal(event_time('Feb 28 12:00:00', format: '%b %d %H:%M:%S'), time)
       assert_equal(@expected.merge('ident' => ident), record)
     }
   end
 
-  def test_parse_various_characters_for_tag_with_priority
+  data('regexp' => 'regexp', 'string' => 'string')
+  def test_parse_various_characters_for_tag_with_priority(param)
     ident = '~!@#$%^&*()_+=-`]{};"\'/?\\,.<>'
-    @parser.configure('with_priority' => true)
+    @parser.configure('with_priority' => true, 'parser_type' => param)
     @parser.instance.parse("<6>Feb 28 12:00:00 192.168.0.1 #{ident}[11111]: [error] Syslog test") { |time, record|
       assert_equal(event_time('Feb 28 12:00:00', format: '%b %d %H:%M:%S'), time)
       assert_equal(@expected.merge('pri' => 6, 'ident' => ident), record)
     }
   end
 
+  sub_test_case 'Check the difference of regexp and string parser' do # pins where the two parser_type values agree and where they diverge
+    # examples from rfc3164
+    data('regexp' => 'regexp', 'string' => 'string')
+    test 'wrong result with no ident message by default' do |param| # both parser types report 'Use' as ident
+      @parser.configure('parser_type' => param)
+      @parser.instance.parse('Feb  5 17:32:18 10.0.0.99 Use the BFG!') { |time, record|
+        assert_equal({'host' => '10.0.0.99', 'ident' => 'Use', 'message' => 'the BFG!'}, record)
+      }
+    end
+
+    test "proper result with no ident message by 'support_colonless_ident false'" do # string parser only; the option is set just for this case
+      @parser.configure('parser_type' => 'string', 'support_colonless_ident' => false)
+      @parser.instance.parse('Feb  5 17:32:18 10.0.0.99 Use the BFG!') { |time, record|
+        assert_equal({'host' => '10.0.0.99', 'message' => 'Use the BFG!'}, record)
+      }
+    end
+
+    test "string parsers can't parse broken syslog message and generate wrong record" do
+      @parser.configure('parser_type' => 'string')
+      @parser.instance.parse("1990 Oct 22 10:52:01 TZ-6 scapegoat.dmz.example.org 10.1.2.32 sched[0]: That's All Folks!") { |time, record|
+        expected = {'host' => 'scapegoat.dmz.example.org', 'ident' => 'sched', 'pid' => '0', 'message' => "That's All Folks!"}
+        assert_not_equal(expected, record) # only checks that the record differs from the well-formed one
+      }
+    end
+
+    test "regexp parsers can't parse broken syslog message and raises an error" do
+      @parser.configure('parser_type' => 'regexp')
+      assert_raise(Fluent::TimeParser::TimeParseError) { # same input as the string-parser case above, but an exception is expected
+        @parser.instance.parse("1990 Oct 22 10:52:01 TZ-6 scapegoat.dmz.example.org 10.1.2.32 sched[0]: That's All Folks!") { |time, record| }
+      }
+    end
+
+    data('regexp' => 'regexp', 'string' => 'string')
+    test "':' included message breaks regexp parser" do |param|
+      @parser.configure('parser_type' => param)
+      @parser.instance.parse('Aug 10 12:00:00 127.0.0.1 test foo:bar') { |time, record|
+        expected = {'host' => '127.0.0.1', 'ident' => 'test', 'message' => 'foo:bar'}
+        if param == 'string'
+          assert_equal(expected, record)
+        else
+          assert_not_equal(expected, record) # regexp result is only required to differ; its exact shape is not pinned
+        end
+      }
+    end
+
+    data('regexp' => 'regexp', 'string' => 'string')
+    test "Only no whitespace content in MSG causes different result" do |param|
+      @parser.configure('parser_type' => param)
+      @parser.instance.parse('Aug 10 12:00:00 127.0.0.1 value1,value2,value3,value4') { |time, record|
+        # 'message' is correct but regexp set it as 'ident'
+        if param == 'string'
+          expected = {'host' => '127.0.0.1', 'message' => 'value1,value2,value3,value4'}
+          assert_equal(expected, record)
+        else
+          expected = {'host' => '127.0.0.1', 'ident' => 'value1,value2,value3,value4', 'message' => ''} # regexp parser leaves 'message' empty
+          assert_equal(expected, record)
+        end
+      }
+    end
+  end
+
   class TestRFC5424Regexp < self
     def test_parse_with_rfc5424_message
       @parser.configure(
@@ -273,10 +351,12 @@ def test_parse_with_rfc5424_message_both_timestamp
   end
 
   class TestAutoRegexp < self
-    def test_auto_with_legacy_syslog_message
+    data('regexp' => 'regexp', 'string' => 'string')
+    def test_auto_with_legacy_syslog_message(param)
       @parser.configure(
                         'time_format' => '%b %d %M:%S:%H',
                         'message_format' => 'auto',
+                        'parser_type' => param
                         )
       text = 'Feb 28 00:00:12 192.168.0.1 fluentd[11111]: [error] Syslog test'
       @parser.instance.parse(text) do |time, record|
@@ -286,11 +366,13 @@ def test_auto_with_legacy_syslog_message
       assert_equal(Fluent::Plugin::SyslogParser::REGEXP, @parser.instance.patterns['format'])
     end
 
-    def test_auto_with_legacy_syslog_priority_message
+    data('regexp' => 'regexp', 'string' => 'string')
+    def test_auto_with_legacy_syslog_priority_message(param)
       @parser.configure(
                         'time_format' => '%b %d %M:%S:%H',
                         'with_priority' => true,
                         'message_format' => 'auto',
+                        'parser_type' => param
                         )
       text = '<6>Feb 28 12:00:00 192.168.0.1 fluentd[11111]: [error] Syslog test'
       @parser.instance.parse(text) do |time, record|
@@ -300,11 +382,13 @@ def test_auto_with_legacy_syslog_priority_message
       assert_equal(Fluent::Plugin::SyslogParser::REGEXP_WITH_PRI, @parser.instance.patterns['format'])
     end
 
-    def test_parse_with_rfc5424_message
+    data('regexp' => 'regexp', 'string' => 'string')
+    def test_parse_with_rfc5424_message(param)
       @parser.configure(
                         'time_format' => '%Y-%m-%dT%H:%M:%S.%L%z',
                         'message_format' => 'auto',
                         'with_priority' => true,
+                        'parser_type' => param
                         )
       text = '<16>1 2017-02-06T13:14:15.003Z 192.168.0.1 fluentd - - - Hi, from Fluentd!'
       @parser.instance.parse(text) do |time, record|
@@ -318,11 +402,13 @@ def test_parse_with_rfc5424_message
                    @parser.instance.patterns['format'])
     end
 
-    def test_parse_with_rfc5424_structured_message
+    data('regexp' => 'regexp', 'string' => 'string')
+    def test_parse_with_rfc5424_structured_message(param)
       @parser.configure(
                         'time_format' => '%Y-%m-%dT%H:%M:%S.%L%z',
                         'message_format' => 'auto',
                         'with_priority' => true,
+                        'parser_type' => param
                         )
       text = '<16>1 2017-02-06T13:14:15.003Z 192.168.0.1 fluentd 11111 ID24224 [exampleSDID@20224 iut="3" eventSource="Application" eventID="11211"] Hi, from Fluentd!'
       @parser.instance.parse(text) do |time, record|
@@ -337,12 +423,14 @@ def test_parse_with_rfc5424_structured_message
                    @parser.instance.patterns['format'])
     end
 
-    def test_parse_with_both_message_type
+    data('regexp' => 'regexp', 'string' => 'string')
+    def test_parse_with_both_message_type(param)
       @parser.configure(
         'time_format' => '%b %d %M:%S:%H',
         'rfc5424_time_format' => '%Y-%m-%dT%H:%M:%S.%L%z',
         'message_format' => 'auto',
         'with_priority' => true,
+        'parser_type' => param
       )
       text = '<1>Feb 28 12:00:00 192.168.0.1 fluentd[11111]: [error] Syslog test'
       @parser.instance.parse(text) do |time, record|
@@ -382,12 +470,14 @@ def test_parse_with_both_message_type
                    @parser.instance.patterns['format'])
     end
 
-    def test_parse_with_both_message_type_and_priority
+    data('regexp' => 'regexp', 'string' => 'string')
+    def test_parse_with_both_message_type_and_priority(param)
       @parser.configure(
                         'time_format' => '%b %d %M:%S:%H',
                         'rfc5424_time_format' => '%Y-%m-%dT%H:%M:%S.%L%z',
                         'with_priority' => true,
                         'message_format' => 'auto',
+                        'parser_type' => param
                         )
       text = '<6>Feb 28 12:00:00 192.168.0.1 fluentd[11111]: [error] Syslog test'
       @parser.instance.parse(text) do |time, record|