You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
We are using fluentd for log collection from our Linux and Windows node pools on AKS clusters. Over the last few months we have noticed that we are paying more for bandwidth than for other resources. When I dug into it, I found that fluentd is sending far more traffic to Elasticsearch than the volume of logs actually received at the Elasticsearch end.
For example, we are sending around 200 GB of logs to the destination, but the bandwidth (total bytes sent by the fluentd pods) is far higher, around 10 TB every day, which is costing us an insane amount of money.
I tried upgrading fluentd to the latest version, but that did not make much of a difference. I could not find anybody else on the internet facing a similar issue, so I need to find out what is wrong with my fluentd deployment and what I can do to fix it.
i would greatly appreciate your time and efforts in helping me on this one.
here is fluentd config on my clusters:
To Reproduce
deployed fluentd with below config on aks cluster:
send logs to other aks cluster having elasticsearch and grafana
Expected behavior
fluentd consuming bandwidth not more than 10-15% in addition to actual data sent.
Your Environment
- Fluentd version: fluentd:v1.17
- Package version:
- Operating system: Ubuntu 20 and Windows Server 2022
- Kernel version:
Your Configuration
` containers.conf: |
  # Tail the container log files under /var/log/containers and emit them with a
  # tag that defaults to kubernetes.* (overridable via FLUENT_CONTAINER_TAIL_TAG).
  <source>
    @type tail
    @id in_tail_container_logs
    path /var/log/containers/*.log
    pos_file /var/log/fluentd-containers.log.pos
    tag "#{ENV['FLUENT_CONTAINER_TAIL_TAG'] || 'kubernetes.*'}"
    # Log files skipped by default; FLUENT_CONTAINER_TAIL_EXCLUDE_PATH replaces the whole list.
    exclude_path "#{ENV['FLUENT_CONTAINER_TAIL_EXCLUDE_PATH'] || '["/var/log/containers/metricbeat*", "/var/log/containers/*calico-system*", "/var/log/containers/microsoft-defender-*", "/var/log/containers/*gatekeeper-system*", "/var/log/containers/*kube-system*", "/var/log/containers/*kube-public*", "/var/log/containers/*kube-node-lease*", "/var/log/containers/*tigera-operator*", "/var/log/containers/*prometheus*", "/var/log/containers/fluent-daemon*"]'}"
    read_from_head false
    # The <parse> section is defined in tail_container_parse.conf.
    @include tail_container_parse.conf
    encoding utf-8
  </source>
elastic.conf: |
  # Set the string encoding of every record before it is shipped.
  <filter **>
    @type record_modifier
    # The original declared char_encoding twice (`ISO-8859-1:utf-8`, then `utf-8`).
    # Only one value can take effect, so the duplicate is dropped.
    # NOTE(review): the later `utf-8` is kept on the assumption that the last
    # duplicate wins in Fluentd's parser - confirm, and use `ISO-8859-1:utf-8`
    # instead if a real ISO-8859-1 -> UTF-8 conversion is wanted.
    char_encoding utf-8
  </filter>

  # Ship every event to Elasticsearch. Most settings can be overridden through
  # FLUENT_ELASTICSEARCH_* environment variables; the value after `||` is the default.
  <match **>
    @type elasticsearch
    @id out_es
    # NOTE(review): with the default level ERROR, warnings from this output
    # (e.g. about retried flushes) are not logged - consider lowering it while
    # investigating traffic volume.
    @log_level "#{ENV['FLUENT_LOG_LEVEL'] || 'ERROR' }"
    include_tag_key true
    suppress_type_name true
    hosts "#{ENV['FLUENT_ELASTICSEARCH_HOSTS'] || 'elasticsearch:9200' }"
    path "#{ENV['FLUENT_ELASTICSEARCH_PATH']}"
    scheme "#{ENV['FLUENT_ELASTICSEARCH_SCHEME'] || 'http'}"
    ssl_verify "#{ENV['FLUENT_ELASTICSEARCH_SSL_VERIFY'] || 'true'}"
    ssl_version "#{ENV['FLUENT_ELASTICSEARCH_SSL_VERSION'] || 'TLSv1_2'}"
    default_elasticsearch_version "#{ENV['FLUENT_ELASTICSEARCH_VRSN'] || '8'}"
    user "#{ENV['FLUENT_ELASTICSEARCH_USER'] || use_default}"
    password "#{ENV['FLUENT_ELASTICSEARCH_PASSWORD'] || use_default}"
    reload_connections "#{ENV['FLUENT_ELASTICSEARCH_RELOAD_CONNECTIONS'] || 'false'}"
    reconnect_on_error "#{ENV['FLUENT_ELASTICSEARCH_RECONNECT_ON_ERROR'] || 'true'}"
    reload_on_failure "#{ENV['FLUENT_ELASTICSEARCH_RELOAD_ON_FAILURE'] || 'true'}"
    log_es_400_reason "#{ENV['FLUENT_ELASTICSEARCH_LOG_ES_400_REASON'] || 'false'}"
    logstash_prefix "#{ENV['FLUENT_ELASTICSEARCH_LOGSTASH_PREFIX'] || 'logstash'}"
    logstash_dateformat "#{ENV['FLUENT_ELASTICSEARCH_LOGSTASH_DATEFORMAT'] || '%Y.%m.%d'}"
    logstash_format "#{ENV['FLUENT_ELASTICSEARCH_LOGSTASH_FORMAT'] || 'true'}"
    index_name "#{ENV['FLUENT_ELASTICSEARCH_LOGSTASH_INDEX_NAME'] || 'logstash'}"
    target_index_key "#{ENV['FLUENT_ELASTICSEARCH_TARGET_INDEX_KEY'] || use_nil}"
    # commenting out type_name, type was removed from ES in 8.x
    #type_name "#{ENV['FLUENT_ELASTICSEARCH_LOGSTASH_TYPE_NAME'] || 'fluentd'}"
    include_timestamp "#{ENV['FLUENT_ELASTICSEARCH_INCLUDE_TIMESTAMP'] || 'false'}"
    template_name "#{ENV['FLUENT_ELASTICSEARCH_TEMPLATE_NAME'] || use_nil}"
    template_file "#{ENV['FLUENT_ELASTICSEARCH_TEMPLATE_FILE'] || use_nil}"
    template_overwrite "#{ENV['FLUENT_ELASTICSEARCH_TEMPLATE_OVERWRITE'] || use_default}"
    request_timeout "#{ENV['FLUENT_ELASTICSEARCH_REQUEST_TIMEOUT'] || '120s'}"
    application_name "#{ENV['FLUENT_ELASTICSEARCH_APPLICATION_NAME'] || use_default}"
    enable_ilm "#{ENV['FLUENT_ELASTICSEARCH_ENABLE_ILM'] || 'false'}"
    customize_template "#{ENV['FLUENT_ELASTICSEARCH_CUSTOMIZE_TEMPLATE'] || use_default}"
    rollover_index "#{ENV['FLUENT_ELASTICSEARCH_ROLLOVER_INDEX'] || 'false'}"
    # Uses its own variable: the original read FLUENT_ELASTICSEARCH_ROLLOVER_INDEX
    # here as well, so enabling rollover would have set the date pattern to "true".
    index_date_pattern "#{ENV['FLUENT_ELASTICSEARCH_INDEX_DATE_PATTERN'] || 'now/d'}"
    ilm_policy_id "#{ENV['FLUENT_ELASTICSEARCH_ILM_POLICY_ID'] || use_default}"
    ilm_policy "#{ENV['FLUENT_ELASTICSEARCH_ILM_POLICY'] || use_default}"
    ilm_policy_overwrite "#{ENV['FLUENT_ELASTICSEARCH_ILM_POLICY_OVERWRITE'] || 'false'}"
    pipeline fluentd_logs
    verify_es_version_at_startup false
    # NOTE(review): no request compression is configured; the plugin's
    # `compression_level` option may reduce bytes on the wire - confirm that the
    # installed plugin version and the Elasticsearch endpoint support it.
    <buffer>
      @type file
      path /fluentd/log/elastic-buffer
      flush_thread_count 20
      flush_mode interval
      flush_interval 15s
      # NOTE(review): 256M is larger than Elasticsearch's documented default
      # `http.max_content_length` (100mb). A rejected bulk request is retried with
      # the whole chunk, which would multiply traffic - verify against the server
      # setting and the fluentd retry metrics.
      chunk_limit_size 256M
      queue_limit_length 512
      retry_max_interval 30
      retry_forever false
    </buffer>
  </match>
fluent_out.conf: |
  # Print events whose tag matches @FLUENT_LOG to stdout.
  # NOTE(review): kubernetes.conf declares @FLUENT_LOG as a <label>, while a
  # <match> pattern is compared against tags; main.conf also includes
  # elastic.conf (with its `<match **>`) before this file. Confirm this section
  # is ever reached.
  <match @FLUENT_LOG>
    @type stdout
  </match>
forwarder.conf: |
  # Disabled forward input, kept for reference (every line below is commented out).
  #<source>
  #  # This is used by windows VMs to proxy logs to elasticsearch
  #  @type forward
  #  port 24224
  #  bind 0.0.0.0
  #</source>
kubernetes.conf: |
  # Discard events tagged fluent.** that are routed to the @FLUENT_LOG label.
  <label @FLUENT_LOG>
    <match fluent.**>
      @type null
      @id ignore_fluent_logs
    </match>
  </label>

  # --- Node-level log files -------------------------------------------------
  # Each <source> below tails one file under /var/log and keeps its read
  # position in its own pos_file.

  <source>
    @type tail
    @id in_tail_minion
    path /var/log/salt/minion
    pos_file /var/log/fluentd-salt.pos
    tag salt
    <parse>
      @type regexp
      expression /^(?<time>[^ ]* [^ ,]*)[^\[]*\[[^\]]*\]\[(?<severity>[^ \]]*) *\] (?<message>.*)$/
      time_format %Y-%m-%d %H:%M:%S
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_startupscript
    path /var/log/startupscript.log
    pos_file /var/log/fluentd-startupscript.log.pos
    tag startupscript
    <parse>
      @type syslog
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_docker
    path /var/log/docker.log
    pos_file /var/log/fluentd-docker.log.pos
    tag docker
    <parse>
      @type regexp
      # The status code group is a named capture `(?<status_code>...)`. The
      # original had `($<status_code>...)`, which can never match, so the
      # field was silently never extracted.
      expression /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=(?<status_code>\d+))?/
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_etcd
    path /var/log/etcd.log
    pos_file /var/log/fluentd-etcd.log.pos
    tag etcd
    <parse>
      @type none
    </parse>
  </source>

  # --- Kubernetes component logs, all parsed with the `kubernetes` parser ----

  <source>
    @type tail
    @id in_tail_kubelet
    multiline_flush_interval 5s
    path /var/log/kubelet.log
    pos_file /var/log/fluentd-kubelet.log.pos
    tag kubelet
    <parse>
      @type kubernetes
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_kube_proxy
    multiline_flush_interval 5s
    path /var/log/kube-proxy.log
    pos_file /var/log/fluentd-kube-proxy.log.pos
    tag kube-proxy
    <parse>
      @type kubernetes
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_kube_apiserver
    multiline_flush_interval 5s
    path /var/log/kube-apiserver.log
    pos_file /var/log/fluentd-kube-apiserver.log.pos
    tag kube-apiserver
    <parse>
      @type kubernetes
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_kube_controller_manager
    multiline_flush_interval 5s
    path /var/log/kube-controller-manager.log
    pos_file /var/log/fluentd-kube-controller-manager.log.pos
    tag kube-controller-manager
    <parse>
      @type kubernetes
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_kube_scheduler
    multiline_flush_interval 5s
    path /var/log/kube-scheduler.log
    pos_file /var/log/fluentd-kube-scheduler.log.pos
    tag kube-scheduler
    <parse>
      @type kubernetes
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_rescheduler
    multiline_flush_interval 5s
    path /var/log/rescheduler.log
    pos_file /var/log/fluentd-rescheduler.log.pos
    tag rescheduler
    <parse>
      @type kubernetes
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_glbc
    multiline_flush_interval 5s
    path /var/log/glbc.log
    pos_file /var/log/fluentd-glbc.log.pos
    tag glbc
    <parse>
      @type kubernetes
    </parse>
  </source>

  <source>
    @type tail
    @id in_tail_cluster_autoscaler
    multiline_flush_interval 5s
    path /var/log/cluster-autoscaler.log
    pos_file /var/log/fluentd-cluster-autoscaler.log.pos
    tag cluster-autoscaler
    <parse>
      @type kubernetes
    </parse>
  </source>

  # --- API server audit log ---------------------------------------------------
  # Example:
  # 2017-02-09T00:15:57.992775796Z AUDIT: id="90c73c7c-97d6-4b65-9461-f94606ff825f" ip="104.132.1.72" method="GET" user="kubecfg" as="<self>" asgroups="<lookup>" namespace="default" uri="/api/v1/namespaces/default/pods"
  # 2017-02-09T00:15:57.993528822Z AUDIT: id="90c73c7c-97d6-4b65-9461-f94606ff825f" response="200"
  <source>
    @type tail
    @id in_tail_kube_apiserver_audit
    multiline_flush_interval 5s
    path /var/log/kubernetes/kube-apiserver-audit.log
    pos_file /var/log/kube-apiserver-audit.log.pos
    tag kube-apiserver-audit
    <parse>
      @type multiline
      format_firstline /^\S+\s+AUDIT:/
      # Fields must be explicitly captured by name to be parsed into the record.
      # Fields may not always be present, and order may change, so this just looks
      # for a list of key="\"quoted\" value" pairs separated by spaces.
      # Unknown fields are ignored.
      # Note: We can't separate query/response lines as format1/format2 because
      # they don't always come one after the other for a given query.
      format1 /^(?<time>\S+) AUDIT:(?: (?:id="(?<id>(?:[^"\\]|\\.)*)"|ip="(?<ip>(?:[^"\\]|\\.)*)"|method="(?<method>(?:[^"\\]|\\.)*)"|user="(?<user>(?:[^"\\]|\\.)*)"|groups="(?<groups>(?:[^"\\]|\\.)*)"|as="(?<as>(?:[^"\\]|\\.)*)"|asgroups="(?<asgroups>(?:[^"\\]|\\.)*)"|namespace="(?<namespace>(?:[^"\\]|\\.)*)"|uri="(?<uri>(?:[^"\\]|\\.)*)"|response="(?<response>(?:[^"\\]|\\.)*)"|\w+="(?:[^"\\]|\\.)*"))*/
      time_format %Y-%m-%dT%T.%L%Z
    </parse>
  </source>

  # --- Enrichment of kubernetes.** records -------------------------------------
  # Attach Kubernetes metadata; the API URL defaults to the in-cluster service
  # address built from KUBERNETES_SERVICE_HOST / KUBERNETES_SERVICE_PORT.
  <filter kubernetes.**>
    @type kubernetes_metadata
    @id filter_kube_metadata
    kubernetes_url "#{ENV['FLUENT_FILTER_KUBERNETES_URL'] || 'https://' + ENV.fetch('KUBERNETES_SERVICE_HOST') + ':' + ENV.fetch('KUBERNETES_SERVICE_PORT') + '/api'}"
    verify_ssl "#{ENV['KUBERNETES_VERIFY_SSL'] || true}"
    ca_file "#{ENV['KUBERNETES_CA_FILE']}"
    # NOTE(review): labels, container, master-URL and namespace metadata are all
    # attached to every record by default; each skip_* flag below removes one
    # group and therefore shrinks the payload sent to Elasticsearch.
    skip_labels "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_LABELS'] || 'false'}"
    skip_container_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_CONTAINER_METADATA'] || 'false'}"
    skip_master_url "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_MASTER_URL'] || 'false'}"
    skip_namespace_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_NAMESPACE_METADATA'] || 'false'}"
    watch "#{ENV['FLUENT_KUBERNETES_WATCH'] || 'true'}"
  </filter>

  # Replace dots in field names (including nested ones) with underscores.
  <filter kubernetes.**>
    @type dedot
    de_dot true
    de_dot_separator _
    de_dot_nested true
  </filter>
main.conf: |
  # Entry point: pulls in the other configuration files in this order.
  @include /fluentd/etc/prometheus.conf
  @include /fluentd/etc/containers.conf
  @include /fluentd/etc/kubernetes.conf
  @include /fluentd/etc/elastic.conf
  @include /fluentd/etc/fluent_out.conf
  @include /fluentd/etc/forwarder.conf

  # Include platform-specific configurations
  @include /fluentd/etc/nix.d/*
  @include /fluentd/etc/win.d/*

  # Include all custom configurations
  @include /fluentd/etc/conf.d/*
prometheus.conf: |
  # Metrics endpoint; bind address, port and path can be overridden through
  # FLUENTD_PROMETHEUS_* environment variables.
  <source>
    @type prometheus
    @id in_prometheus
    bind "#{ENV['FLUENTD_PROMETHEUS_BIND'] || '0.0.0.0'}"
    port "#{ENV['FLUENTD_PROMETHEUS_PORT'] || '24231'}"
    metrics_path "#{ENV['FLUENTD_PROMETHEUS_PATH'] || '/metrics'}"
  </source>

  # Output-plugin metrics, labelled with the hostname.
  <source>
    @type prometheus_output_monitor
    @id in_prometheus_output_monitor
    interval 10
    <labels>
      hostname ${hostname}
    </labels>
  </source>

  # count the number of incoming records per tag
  <filter **>
    @type prometheus
    <metric>
      name fluentd_input_status_num_records_total
      type counter
      desc The total number of incoming records
      <labels>
        tag ${tag}
        hostname ${hostname}
      </labels>
    </metric>
  </filter>
tail_container_parse.conf: |
  # Parser included by the container tail source in containers.conf.
  # Three patterns are listed: CRI with a nested JSON parser, plain CRI, and none.
  <parse>
    @type multi_format
    <pattern>
      format cri
      <parse>
        @type json
      </parse>
    </pattern>
    <pattern>
      format cri
    </pattern>
    <pattern>
      format none
    </pattern>
  </parse>
transforms.conf: |
  # Add host.name and kubernetes.pod.uid to kubernetes.** records, taken from
  # the record's kubernetes.host and kubernetes.pod_id values.
  # NOTE(review): main.conf does not include this file by name; presumably it
  # is loaded through one of the wildcard includes - confirm.
  <filter kubernetes.**>
    @type record_transformer
    enable_ruby
    <record>
      host.name ${record.dig("kubernetes", "host")}
      kubernetes.pod.uid ${record.dig("kubernetes", "pod_id")}
    </record>
  </filter>`
Your Error Log
No errors related to this issue were spotted.
Additional context
No response
The text was updated successfully, but these errors were encountered:
Describe the bug
Hi,
We are using fluentd for log collection from our Linux and Windows node pools on AKS clusters. Over the last few months we have noticed that we are paying more for bandwidth than for other resources. When I dug into it, I found that fluentd is sending far more traffic to Elasticsearch than the volume of logs actually received at the Elasticsearch end.
For example, we are sending around 200 GB of logs to the destination, but the bandwidth (total bytes sent by the fluentd pods) is far higher, around 10 TB every day, which is costing us an insane amount of money.
I tried upgrading fluentd to the latest version, but that did not make much of a difference. I could not find anybody else on the internet facing a similar issue, so I need to find out what is wrong with my fluentd deployment and what I can do to fix it.
i would greatly appreciate your time and efforts in helping me on this one.
here is fluentd config on my clusters:
To Reproduce
Expected behavior
fluentd consuming bandwidth not more than 10-15% in addition to actual data sent.
Your Environment
Your Configuration
Your Error Log
Additional context
No response
The text was updated successfully, but these errors were encountered: