From c6df8ac3a4e1639a662e6c6d10e2417e05ef7b90 Mon Sep 17 00:00:00 2001 From: Rich Megginson Date: Mon, 7 Oct 2019 09:37:52 -0600 Subject: [PATCH] Bug 1756920: fluentd pods do not process kubernetes events https://bugzilla.redhat.com/show_bug.cgi?id=1756920 This tries to keep the functionality the same in 4.x as it was in 3.x. In 3.x, you had to: - deploy the eventrouter with a pod name like "logging-eventrouter-*" - deploy the eventrouter in the default namespace - set `TRANSFORM_EVENTS=true` - set `MERGE_JSON_LOG=true` (which was the default in 3.x) In 4.x, the pod name changes to "eventrouter-*" to follow the convention of our other logging pods. The eventrouter defaults to running in "openshift-logging" - it doesn't really matter, as long as it is running in an "infra" namespace. This change also enables `MERGE_JSON_LOG=true` for eventrouter pods. The biggest problem is that you have to set the cluster to unmanaged in order to set `TRANSFORM_EVENTS=true`. We could workaround that by making `TRANSFORM_EVENTS=true` the default value. That would cause every kubernetes record to be checked to see if it looks like and event, and process it as such if it matches. I'm not sure what the performance implications would be. Also ports eventrouter test to 4.x Also adds a test for eventrouter Info type support update eventrouter to pick up fix for Bug 1701495 --- eventrouter | 2 +- .../configs.d/openshift/filter-k8s-meta.conf | 7 + .../openshift/filter-post-genid.conf | 2 +- .../openshift/filter-viaq-data-model.conf | 1 + fluentd/run.sh | 5 - hack/test-logging.sh | 2 +- .../templates/eventrouter_template.yaml | 97 +++++++++++ .../logging-ci-test-runner-template.yaml | 21 ++- test/eventrouter.sh | 152 +++++++++++++----- 9 files changed, 234 insertions(+), 55 deletions(-) create mode 100644 hack/testing/templates/eventrouter_template.yaml diff --git a/eventrouter b/eventrouter index fec0a81d9..7c289ccf0 160000 --- a/eventrouter +++ b/eventrouter @@ -1 +1 @@ -Subproject commit fec0a81d9966c93bd544d5407be79001034e0c25 +Subproject commit 7c289ccf0ee93cf32ed19685bbd005a5c69749f1 diff --git a/fluentd/configs.d/openshift/filter-k8s-meta.conf b/fluentd/configs.d/openshift/filter-k8s-meta.conf index a986c93ea..fbd5734a3 100644 --- a/fluentd/configs.d/openshift/filter-k8s-meta.conf +++ b/fluentd/configs.d/openshift/filter-k8s-meta.conf @@ -20,3 +20,10 @@ preserve_json_log "#{ENV['PRESERVE_JSON_LOG'] || 'true'}" json_fields "#{ENV['JSON_FIELDS'] || 'log,MESSAGE'}" + + + @type parse_json_field + merge_json_log true + preserve_json_log true + json_fields "#{ENV['JSON_FIELDS'] || 'log,MESSAGE'}" + diff --git a/fluentd/configs.d/openshift/filter-post-genid.conf b/fluentd/configs.d/openshift/filter-post-genid.conf index 46f495b73..2ed58647b 100644 --- a/fluentd/configs.d/openshift/filter-post-genid.conf +++ b/fluentd/configs.d/openshift/filter-post-genid.conf @@ -4,5 +4,5 @@ @type elasticsearch_genid_ext hash_id_key viaq_msg_id alt_key kubernetes.event.metadata.uid - alt_tags "#{ENV['GENID_ALT_TAG'] || 'kubernetes.var.log.containers.logging-eventrouter-*.** kubernetes.journal.container._default_.kubernetes-event'}" + alt_tags "#{ENV['GENID_ALT_TAG'] || 'kubernetes.var.log.containers.logging-eventrouter-*.** kubernetes.var.log.containers.eventrouter-*.** kubernetes.var.log.containers.cluster-logging-eventrouter-*.** kubernetes.journal.container._default_.kubernetes-event'}" diff --git a/fluentd/configs.d/openshift/filter-viaq-data-model.conf b/fluentd/configs.d/openshift/filter-viaq-data-model.conf index 14c0380d1..70dae3a29 100644 --- a/fluentd/configs.d/openshift/filter-viaq-data-model.conf +++ b/fluentd/configs.d/openshift/filter-viaq-data-model.conf @@ -13,6 +13,7 @@ undefined_to_string "#{ENV['CDM_UNDEFINED_TO_STRING'] || 'false'}" undefined_dot_replace_char "#{ENV['CDM_UNDEFINED_DOT_REPLACE_CHAR'] || 'UNUSED'}" undefined_max_num_fields "#{ENV['CDM_UNDEFINED_MAX_NUM_FIELDS'] || '-1'}" + process_kubernetes_events "#{ENV['TRANSFORM_EVENTS'] || 'false'}" enabled false tag "audit.log**" diff --git a/fluentd/run.sh b/fluentd/run.sh index 853d5eb07..ddcd5dd45 100644 --- a/fluentd/run.sh +++ b/fluentd/run.sh @@ -229,11 +229,6 @@ if [[ "${USE_REMOTE_SYSLOG:-}" = "true" ]] ; then fi fi -# Disable process_kubernetes_events if TRANSFORM_EVENTS is false client. -if [ "${TRANSFORM_EVENTS:-}" != true ] ; then - sed -i 's/\(.*@type viaq_data_model.*\)/\1\n process_kubernetes_events false/' $CFG_DIR/openshift/filter-viaq-data-model.conf -fi - if [ "${AUDIT_CONTAINER_ENGINE:-}" = "true" ] ; then cp -f $CFG_DIR/input-pre-audit-log.conf $CFG_DIR/openshift cp -f $CFG_DIR/filter-pre-a-audit-exclude.conf $CFG_DIR/openshift diff --git a/hack/test-logging.sh b/hack/test-logging.sh index f9c0716e8..aa2d35d7a 100755 --- a/hack/test-logging.sh +++ b/hack/test-logging.sh @@ -141,7 +141,7 @@ if [ -n "${TEST_SUITES:-}" ] ; then fi oc process -p TEST_ROOT=$testroot \ -p TEST_NAMESPACE_NAME=$( oc project -q ) \ - -p TEST_IMAGE=$testimage \ + -p TEST_IMAGE=$testimage -p IMAGE_FORMAT="${IMAGE_FORMAT:-}" \ ${artifact_dir_arg:-} ${test_suites_arg:-} \ -f hack/testing/templates/logging-ci-test-runner-template.yaml | oc create -f - diff --git a/hack/testing/templates/eventrouter_template.yaml b/hack/testing/templates/eventrouter_template.yaml new file mode 100644 index 000000000..d567511ee --- /dev/null +++ b/hack/testing/templates/eventrouter_template.yaml @@ -0,0 +1,97 @@ +kind: Template +apiVersion: v1 +metadata: + name: eventrouter-template + annotations: + description: "A pod forwarding kubernetes events to cluster logging stack." + tags: "events,EFK,logging,cluster-logging" +objects: + - kind: ServiceAccount + apiVersion: v1 + metadata: + name: eventrouter + namespace: ${NAMESPACE} + - kind: ClusterRole + apiVersion: v1 + metadata: + name: event-reader + rules: + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "watch", "list"] + - kind: ClusterRoleBinding + apiVersion: v1 + metadata: + name: event-reader-binding + subjects: + - kind: ServiceAccount + name: eventrouter + namespace: ${NAMESPACE} + roleRef: + kind: ClusterRole + name: event-reader + - kind: ConfigMap + apiVersion: v1 + metadata: + name: eventrouter + namespace: ${NAMESPACE} + data: + config.json: |- + { + "sink": "stdout" + } + - kind: Deployment + apiVersion: apps/v1 + metadata: + name: eventrouter + namespace: ${NAMESPACE} + labels: + component: eventrouter + logging-infra: eventrouter + provider: openshift + spec: + selector: + matchLabels: + component: eventrouter + logging-infra: eventrouter + provider: openshift + replicas: 1 + template: + metadata: + labels: + component: eventrouter + logging-infra: eventrouter + provider: openshift + name: eventrouter + spec: + serviceAccount: eventrouter + containers: + - name: kube-eventrouter + image: ${IMAGE} + imagePullPolicy: IfNotPresent + resources: + limits: + memory: ${MEMORY} + requests: + cpu: ${CPU} + memory: ${MEMORY} + volumeMounts: + - name: config-volume + mountPath: /etc/eventrouter + volumes: + - name: config-volume + configMap: + name: eventrouter +parameters: + - name: IMAGE + displayName: Image + value: "registry.redhat.io/openshift4/ose-logging-eventrouter:latest" + - name: MEMORY + displayName: Memory + value: "128Mi" + - name: CPU + displayName: CPU + value: "100m" + - name: NAMESPACE + displayName: Namespace + value: "openshift-logging" diff --git a/hack/testing/templates/logging-ci-test-runner-template.yaml b/hack/testing/templates/logging-ci-test-runner-template.yaml index 4c466f074..83084d477 100644 --- a/hack/testing/templates/logging-ci-test-runner-template.yaml +++ b/hack/testing/templates/logging-ci-test-runner-template.yaml @@ -20,6 +20,15 @@ objects: imagePullPolicy: Always securityContext: privileged: true + env: + - name: IMAGE_FORMAT + value: "${IMAGE_FORMAT}" + - name: SUITE + value: "${TEST_SUITES}" + - name: ARTIFACT_DIR + value: "${ARTIFACT_DIR}" + - name: KUBECONFIG + value: "/tmp/admin.kubeconfig" command: - bash - -c @@ -43,14 +52,7 @@ objects: # image was built with source already on it cd "${TEST_ROOT}" fi - cp /tmp/secret/admin.kubeconfig.orig /tmp/admin.kubeconfig - export KUBECONFIG=/tmp/admin.kubeconfig - set +u - # the below is a template parameter evaluation, not a shell - # variable evaluation - export ARTIFACT_DIR=${ARTIFACT_DIR} - set -u - export SUITE="${TEST_SUITES}" + cp /tmp/secret/admin.kubeconfig.orig $KUBECONFIG . openshift/ci-operator/build-image/launch-e2e-test.sh volumeMounts: @@ -143,5 +145,8 @@ parameters: - description: test suites to run value: '.*' name: TEST_SUITES +- description: IMAGE_FORMAT is used to construct image names - set by CI if used + value: "" + name: IMAGE_FORMAT labels: test: "true" diff --git a/test/eventrouter.sh b/test/eventrouter.sh index 490845c8f..44174225c 100755 --- a/test/eventrouter.sh +++ b/test/eventrouter.sh @@ -5,36 +5,93 @@ source "$(dirname "${BASH_SOURCE[0]}" )/../hack/lib/init.sh" source "${OS_O_A_L_DIR}/hack/testing/util.sh" -function get_eventrouter_pod() { - oc get pods --namespace=default -l component=eventrouter --no-headers | awk '$3 == "Running" {print $1}' +EXTERNAL_REGISTRY=${EXTERNAL_REGISTRY:-registry.svc.ci.openshift.org} +EXT_REG_IMAGE_NS=${EXT_REG_IMAGE_NS:-origin} +MASTER_VERSION=${MASTER_VERSION:-4.3} +get_eventrouter_image() { + local tagsuffix="${1:-latest}" + local ns=openshift + if [ -n "${IMAGE_FORMAT:-}" ] ; then + echo ${IMAGE_FORMAT/'${component}'/logging-eventrouter} + elif oc -n $ns get istag origin-logging-eventrouter:$tagsuffix > /dev/null 2>&1 ; then + oc -n $ns get istag origin-logging-eventrouter:$tagsuffix -o jsonpath='{.image.dockerImageReference}' + else + # fallback to latest externally available image + echo $EXTERNAL_REGISTRY/$EXT_REG_IMAGE_NS/$MASTER_VERSION:logging-eventrouter + fi +} + +deploy_eventrouter() { + local image=$( get_eventrouter_image ) + local ns=${LOGGING_NS:-openshift-logging} + + # I want the template to be usable as-is, by any external user, so I + # don't want to have the nodeSelector in the file - this assumes the + # deployment is the last element in the template + oc process -p NAMESPACE=$ns -p IMAGE=$image \ + -f ${OS_O_A_L_DIR}/hack/testing/templates/eventrouter_template.yaml | \ + jq '.items[-1].spec.template.spec.nodeSelector["logging-ci-test"]="true"' | \ + jq '.items[-1].spec.template.spec.containers[0].imagePullPolicy="Always"' | \ + oc create -f - 2>&1 | artifact_out + local looptries=4 + local ii + # not sure what's going on here - sometimes eventrouter will get an ErrImagePull + # due to authentication issue to internal cluster registry - restarting the pod + # usually makes it work(?????) + for ii in $(seq 1 $looptries) ; do + if os::cmd::try_until_text "get_running_pod eventrouter" eventrouter 2>&1 | artifact_out; then + os::log::info started eventrouter pod $(get_running_pod eventrouter) + ii=1 + break + else + oc delete pod -l component=eventrouter 2>&1 | artifact_out || : + fi + sleep 1 + done + if [ $ii -eq $looptries ] ; then + os::log::error could not start eventrouter pod after $looptries tries + exit 1 + fi } -evpod=$( get_eventrouter_pod ) -if [ -z "$evpod" ]; then - os::log::warning "Eventrouter not deployed" - exit 0 -fi os::test::junit::declare_suite_start "test/eventrouter" FLUENTD_WAIT_TIME=${FLUENTD_WAIT_TIME:-$(( 3 * minute ))} -muxmode=$( oc set env $fluentd_ds --list | grep \^MUX_CLIENT_MODE ) || : -if [ -z "${muxmode:-}" ] ; then - muxmode=MUX_CLIENT_MODE- -fi - cleanup() { local return_code="$?" set +e + if [ $return_code -ne 0 ] ; then + get_all_logging_pod_logs + if [ -n "${evpod:-}" ] ; then + oc logs $evpod > $ARTIFACT_DIR/$evpod.log 2>&1 + fi + fi + # turn off fluentd eventrouter mode stop_fluentd "" $FLUENTD_WAIT_TIME 2>&1 | artifact_out - oc set env $fluentd_ds $muxmode 2>&1 | artifact_out + oc set env $fluentd_ds TRANSFORM_EVENTS- 2>&1 | artifact_out start_fluentd false $FLUENTD_WAIT_TIME 2>&1 | artifact_out + oc process -f ${OS_O_A_L_DIR}/hack/testing/templates/eventrouter_template.yaml | \ + oc delete -f - 2>&1 | artifact_out + os::cmd::try_until_failure "oc get deploy/eventrouter > /dev/null 2>&1" # this will call declare_test_end, suite_end, etc. os::test::junit::reconcile_output exit $return_code } trap "cleanup" EXIT +# put fluentd in eventrouter mode +stop_fluentd "" $FLUENTD_WAIT_TIME 2>&1 | artifact_out +oc set env $fluentd_ds TRANSFORM_EVENTS=true 2>&1 | artifact_out +start_fluentd false $FLUENTD_WAIT_TIME 2>&1 | artifact_out + +deploy_eventrouter +evpod=$( get_running_pod eventrouter ) +if [ -z "$evpod" ]; then + os::log::warning "Eventrouter not deployed" + exit 0 +fi + function warn_nonformatted() { local es_svc=$1 local index=$2 @@ -42,6 +99,8 @@ function warn_nonformatted() { local non_formatted_event_count=$( curl_es $es_svc $index/_count?q=verb:* | get_count_from_json ) if [ "$non_formatted_event_count" != 0 ]; then os::log::warning "$non_formatted_event_count events from eventrouter in index $index were not processed by ViaQ fluentd plugin" + else + os::log::info "good - looks like all eventrouter events were processed by fluentd" fi } @@ -55,38 +114,53 @@ essvc=$( get_es_svc es ) esopssvc=$( get_es_svc es-ops ) esopssvc=${esopssvc:-$essvc} -# Make sure there's no MUX -# undeploy fluentd -stop_fluentd "" $FLUENTD_WAIT_TIME 2>&1 | artifact_out -oc set env $fluentd_ds MUX_CLIENT_MODE- 2>&1 | artifact_out -start_fluentd false 2>&1 | artifact_out - warn_nonformatted $essvc '/project.*' warn_nonformatted $esopssvc '/.operations.*' os::cmd::try_until_not_text "curl_es $esopssvc /.operations.*/_count?q=kubernetes.event.verb:* | get_count_from_json" "^0\$" $FLUENTD_WAIT_TIME prev_event_count=$( curl_es $esopssvc /.operations.*/_count?q=kubernetes.event.verb:* | get_count_from_json ) -# utilize mux if mux pod exists -if oc get dc/logging-mux > /dev/null 2>&1 ; then - # MUX_CLIENT_MODE: maximal - stop_fluentd "" $FLUENTD_WAIT_TIME 2>&1 | artifact_out - oc set env $fluentd_ds MUX_CLIENT_MODE=maximal 2>&1 | artifact_out - start_fluentd false $FLUENTD_WAIT_TIME 2>&1 | artifact_out - os::cmd::try_until_success "logs_count_is_gt $prev_event_count" $FLUENTD_WAIT_TIME - prev_event_count=$( curl_es $esopssvc /.operations.*/_count?q=kubernetes.event.verb:* | get_count_from_json ) +# Check if 1) the doc _id is the same as the kube id 2) there's no duplicates +curl_es $esopssvc /.operations.*/_search?pretty\&q=kubernetes.event:*\&size=9999 > $ARTIFACT_DIR/id-dup-search-raw.json 2>&1 +cat $ARTIFACT_DIR/id-dup-search-raw.json | jq -r '.hits.hits[] | ._id + " " + ._source.kubernetes.event.metadata.uid' | sort > $ARTIFACT_DIR/id-and-uid +os::cmd::expect_success "test -s $ARTIFACT_DIR/id-and-uid" +cat $ARTIFACT_DIR/id-and-uid | awk '{ + if ($1 != $2) {print "Error: es _id " $1 " not equal to kube uid " $2; exit 1} + if ($1 == last1) {print "Error: found duplicate es _id " $1; exit 1} + if ($2 == last2) {print "Error: found duplicate kube uid " $2; exit 1} + last1 = $1; last2 = $2 +}' + +oc apply -f - <&1 | artifact_out - oc set env $fluentd_ds MUX_CLIENT_MODE=minimal 2>&1 | artifact_out - start_fluentd false $FLUENTD_WAIT_TIME 2>&1 | artifact_out - os::cmd::try_until_success "logs_count_is_gt $prev_event_count" $FLUENTD_WAIT_TIME +if ! os::cmd::try_until_text "curl_es $esopssvc /.operations.*/_count?q=kubernetes.event.metadata.name:eventroutertest | get_count_from_json" "^1\$" $FLUENTD_WAIT_TIME ; then + os::log::error did not find 1 record with type Info + curl_es $esopssvc /.operations.*/_search?q=kubernetes.event.metadata.name:eventroutertest\&pretty > $ARTIFACT_DIR/info-search.json 2>&1 || : + exit 1 fi -# Check if there's no duplicates -fpod=$( get_running_pod fluentd ) -qs='{"query":{ "bool": { "must": [ {"term":{"kubernetes.event.verb":"ADDED"}}, {"match":{"message":"'"${fpod}"'"}} ] } }, "_source": ["kubernetes.event.metadata.uid", "message"] }' -ids=$( curl_es $esopssvc /.operations.*/_search -X POST -d "$qs" | python -mjson.tool | egrep uid | awk '{print $2}' | sed -e "s/\"//g" ) -for id in $ids; do - os::cmd::expect_success_and_text "curl_es $esopssvc /.operations.*/_count?q=kubernetes.event.metadata.uid:$id | get_count_from_json" "^1\$" -done