Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 47 additions & 26 deletions test/extended/apiserver/graceful_termination.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,21 +149,22 @@ var _ = g.Describe("[sig-api-machinery][Feature:APIServer][Late]", func() {
o.Expect(err).NotTo(o.HaveOccurred())

// set up
// apiserverName -> reader
logReaders := map[string]io.Reader{}
// apiserverName -> filePaths
auditLogsPerServer := map[string][]string{}
results := map[string]int{}

if *controlPlaneTopology == configv1.ExternalTopologyMode {
mgmtClusterOC := exutil.NewHypershiftManagementCLI("default").AsAdmin().WithoutNamespace()
pods, err := mgmtClusterOC.KubeClient().CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{LabelSelector: "hypershift.openshift.io/control-plane-component=kube-apiserver"})
o.Expect(err).To(o.BeNil())
for _, pod := range pods.Items {
fileName, err := mgmtClusterOC.Run("logs", "-n", pod.Namespace, pod.Name, "-c", "audit-logs").OutputToFile(pod.Name + "-audit.log")
filePath, err := mgmtClusterOC.Run("logs", "-n", pod.Namespace, pod.Name, "-c", "audit-logs").OutputToFile(pod.Name + "-audit.log")
o.Expect(err).NotTo(o.HaveOccurred())
reader, err := os.Open(fileName)
reader, err := os.Open(filePath)
o.Expect(err).NotTo(o.HaveOccurred())
defer reader.Close()
logReaders[pod.Namespace+"-"+pod.Name] = reader
err = reader.Close()
o.Expect(err).NotTo(o.HaveOccurred())
auditLogsPerServer[pod.Namespace+"-"+pod.Name] = append(auditLogsPerServer[pod.Namespace+"-"+pod.Name], filePath)
}
} else {
tempDir, err := ioutil.TempDir("", "test.oc-adm-must-gather.")
Expand Down Expand Up @@ -200,40 +201,56 @@ var _ = g.Describe("[sig-api-machinery][Feature:APIServer][Late]", func() {

gzipReader, err := gzip.NewReader(file)
o.Expect(err).NotTo(o.HaveOccurred())
err = gzipReader.Close()
o.Expect(err).NotTo(o.HaveOccurred())

apiServerName := extractAPIServerNameFromAuditFile(fileName)
o.Expect(apiServerName).ToNot(o.BeEmpty())

logReaders[apiServerName] = gzipReader
auditLogsPerServer[apiServerName] = append(auditLogsPerServer[apiServerName], path)

return nil
})
o.Expect(err).NotTo(o.HaveOccurred())
}
}

for apiServerName, reader := range logReaders {
lateRequestCounter := 0
readFile := false
for apiServerName, auditLogs := range auditLogsPerServer {
for _, auditLog := range auditLogs {
lateRequestCounter := 0
func() {
file, err := os.Open(auditLog)
o.Expect(err).NotTo(o.HaveOccurred())
defer file.Close()

scanner := bufio.NewScanner(reader)
for scanner.Scan() {
text := scanner.Text()
if !strings.HasSuffix(text, "}") {
continue // ignore truncated data
}
o.Expect(text).To(o.HavePrefix(`{"kind":"Event",`))
var reader io.Reader
reader = file
if isGzipFileByExtension(file.Name()) {
gzipReader, err := gzip.NewReader(file)
o.Expect(err).NotTo(o.HaveOccurred())
defer gzipReader.Close()
reader = gzipReader
}

if strings.Contains(text, "openshift.io/during-graceful") && strings.Contains(text, "openshift-origin-external-backend-sampler") {
lateRequestCounter++
}
readFile = true
}
o.Expect(readFile).To(o.BeTrue())
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
text := scanner.Text()
if !strings.HasSuffix(text, "}") {
continue // ignore truncated data
}
o.Expect(text).To(o.HavePrefix(`{"kind":"Event",`))

if strings.Contains(text, "openshift.io/during-graceful") && strings.Contains(text, "openshift-origin-external-backend-sampler") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

have a look at https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.17-e2e-metal-ipi-serial-ovn-ipv6/1820973338286100480

the gathered audit logs (including termination.log) show me

grep -r "through a connection created very late in the graceful termination process" *
2 hits

grep -r "openshift.io/during-graceful"
0 hits

I'm not saying I don't want this. I am noting that something appears off since the audit log isn't showing what you expect.

Copy link
Contributor Author

@p0lyn0mial p0lyn0mial Aug 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm glad you checked it. It turns out that this was changed between versions 4.14 and 4.13 :(

The annotation was replaced by new filters in this pull request, which added a new annotation https://apiserver.k8s.io/shutdown, but the test wasn't updated. :(

Later on, the new annotation was removed and replaced by sending an HTTP response header in this pull request, and no test was added. (?)

This morning, I also realized that there is a test (API LBs follow /readyz of kube-apiserver and stop sending requests) that checks for the LateConnections event.

The issue is that in the run you linked, the event wasn't persisted in the database because the server had issues connecting to the database! For instance:

[-]etcd failed: error getting data from etcd: context deadline exceeded
[-]etcd-readiness failed: error getting data from etcd: context deadline exceeded
etcd retry - counter: 4, lastErrLabel: Unavailable lastError: etcdserver: request timed out, error: context deadline exceeded

That's why we should not rely solely on events.

I'm fine with replacing this test with the one that is going to read the termination log files. (cc @sanchezl)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The openshift.io/during-graceful annotation was brought back in openshift/kubernetes#2077

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, I have checked https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/pr-logs/pull/29006/pull-ci-openshift-origin-master-e2e-metal-ipi-ovn/1833121043204542464/artifacts/e2e-metal-ipi-ovn/gather-audit-logs/artifacts/

the gathered audit logs show me some results (the test didn't fail because it was below the threshold):

 grep -r "openshift.io/during-graceful"
./master-0-audit-2024-09-09T14-17-24.568.log:{"kind":"Event","apiVersion":"audit.k8s.io/v1","level":"Metadata","auditID":"58f32bd7-808e-42f5-a08d-5f731844a2dc","stage":"ResponseComplete","requestURI":"/apis/coordination.k8s.io/v1/namespaces/openshift-marketplace/leases/marketplace-operator-lock","verb":"get","user":{"username":"system:serviceaccount:openshift-marketplace:marketplace-operator","uid":"a8ab4437-6ff2-4947-b718-58c0010fc837","groups":["system:serviceaccounts","system:serviceaccounts:openshift-marketplace","system:authenticated"],"extra":{"authentication.kubernetes.io/credential-id":["JTI=2b6ed9b6-755b-4458-86d7-a80964178dc1"],"authentication.kubernetes.io/node-name":["master-0"],"authentication.kubernetes.io/node-uid":["0ed94f1d-c7c5-4be0-9b82-b1dff79b1ec6"],"authentication.kubernetes.io/pod-name":["marketplace-operator-85c678694c-qpqpf"],"authentication.kubernetes.io/pod-uid":["d9ea82a1-1caf-4657-83a3-7b57e2acd59f"]}},"sourceIPs":["10.128.0.23"],"userAgent":"marketplace-operator/v0.0.0 (linux/amd64) kubernetes/$Format","objectRef":{"resource":"leases","namespace":"openshift-marketplace","name":"marketplace-operator-lock","apiGroup":"coordination.k8s.io","apiVersion":"v1"},"responseStatus":{"metadata":{},"status":"Failure","message":"context deadline exceeded","code":500},"requestReceivedTimestamp":"2024-09-09T13:26:41.930788Z","stageTimestamp":"2024-09-09T13:27:41.934549Z","annotations":{"apiserver.latency.k8s.io/etcd":"1m0.000435665s","apiserver.latency.k8s.io/response-write":"5.726µs","apiserver.latency.k8s.io/serialize-response-object":"2.481828ms","apiserver.latency.k8s.io/total":"1m0.003761653s","authorization.k8s.io/decision":"allow","authorization.k8s.io/reason":"RBAC: allowed by RoleBinding \"marketplace-operator/openshift-marketplace\" of Role \"marketplace-operator\" to ServiceAccount \"marketplace-operator/openshift-marketplace\"","openshift.io/during-graceful":"loopback=false,,readyz=false"}}

./master-1-audit-2024-09-09T14-17-59.588.log:{"kind":"Event","apiVersion":"audit.k8s.io/v1","level":"Metadata","auditID":"775c595d-125c-46a5-b6b2-2ab65a12ffeb","stage":"ResponseComplete","requestURI":"/apis/operator.openshift.io/v1/authentications/cluster/status","verb":"update","user":{"username":"system:serviceaccount:openshift-authentication-operator:authentication-operator","uid":"36d42c5c-bfe9-4fff-8fef-055db4f6df4d","groups":["system:serviceaccounts","system:serviceaccounts:openshift-authentication-operator","system:authenticated"],"extra":{"authentication.kubernetes.io/credential-id":["JTI=ecfaae00-a5d0-4ca3-b3dd-91edcf397651"],"authentication.kubernetes.io/node-name":["master-0"],"authentication.kubernetes.io/node-uid":["0ed94f1d-c7c5-4be0-9b82-b1dff79b1ec6"],"authentication.kubernetes.io/pod-name":["authentication-operator-7df766656f-g7c6p"],"authentication.kubernetes.io/pod-uid":["b19ee6fa-872f-4c58-bad2-f8a897a165a6"]}},"sourceIPs":["192.168.111.20"],"userAgent":"authentication-operator/v0.0.0 (linux/amd64) kubernetes/$Format","objectRef":{"resource":"authentications","name":"cluster","uid":"c6b81673-1467-411f-a9d2-372101ab73fc","apiGroup":"operator.openshift.io","apiVersion":"v1","resourceVersion":"20474","subresource":"status"},"responseStatus":{"metadata":{},"status":"Failure","message":"Timeout: request did not complete within requested timeout - context deadline exceeded","reason":"Timeout","details":{},"code":504},"requestReceivedTimestamp":"2024-09-09T13:34:48.835708Z","stageTimestamp":"2024-09-09T13:35:22.851223Z","annotations":{"apiserver.latency.k8s.io/etcd":"33.090109659s","apiserver.latency.k8s.io/response-write":"2.871µs","apiserver.latency.k8s.io/serialize-response-object":"141.511µs","apiserver.latency.k8s.io/total":"34.015514604s","authorization.k8s.io/decision":"allow","authorization.k8s.io/reason":"RBAC: allowed by ClusterRoleBinding \"system:openshift:operator:authentication\" of ClusterRole \"cluster-admin\" to ServiceAccount \"authentication-operator/openshift-authentication-operator\"","openshift.io/during-graceful":"loopback=false,,readyz=false"}}

./master-1-audit-2024-09-09T14-17-59.588.log:{"kind":"Event","apiVersion":"audit.k8s.io/v1","level":"Metadata","auditID":"f155e7ff-b027-44c6-8fdb-6a3c6bd0e2d7","stage":"ResponseComplete","requestURI":"/apis/coordination.k8s.io/v1/namespaces/openshift-marketplace/leases/marketplace-operator-lock","verb":"get","user":{"username":"system:serviceaccount:openshift-marketplace:marketplace-operator","uid":"a8ab4437-6ff2-4947-b718-58c0010fc837","groups":["system:serviceaccounts","system:serviceaccounts:openshift-marketplace","system:authenticated"],"extra":{"authentication.kubernetes.io/credential-id":["JTI=2b6ed9b6-755b-4458-86d7-a80964178dc1"],"authentication.kubernetes.io/node-name":["master-0"],"authentication.kubernetes.io/node-uid":["0ed94f1d-c7c5-4be0-9b82-b1dff79b1ec6"],"authentication.kubernetes.io/pod-name":["marketplace-operator-85c678694c-qpqpf"],"authentication.kubernetes.io/pod-uid":["d9ea82a1-1caf-4657-83a3-7b57e2acd59f"]}},"sourceIPs":["192.168.111.20"],"userAgent":"marketplace-operator/v0.0.0 (linux/amd64) kubernetes/$Format","objectRef":{"resource":"leases","namespace":"openshift-marketplace","name":"marketplace-operator-lock","apiGroup":"coordination.k8s.io","apiVersion":"v1"},"responseStatus":{"metadata":{},"code":200},"requestReceivedTimestamp":"2024-09-09T13:34:39.919997Z","stageTimestamp":"2024-09-09T13:35:26.540072Z","annotations":{"apiserver.latency.k8s.io/etcd":"44.402019535s","apiserver.latency.k8s.io/response-write":"5.499µs","apiserver.latency.k8s.io/serialize-response-object":"198.405µs","apiserver.latency.k8s.io/total":"46.62007528s","apiserver.latency.k8s.io/transform-response-object":"247ns","authorization.k8s.io/decision":"allow","authorization.k8s.io/reason":"RBAC: allowed by RoleBinding \"marketplace-operator/openshift-marketplace\" of Role \"marketplace-operator\" to ServiceAccount \"marketplace-operator/openshift-marketplace\"","openshift.io/during-graceful":"loopback=false,,readyz=false"}}

lateRequestCounter++
}
}
o.Expect(scanner.Err()).NotTo(o.HaveOccurred())
}()

if lateRequestCounter > 0 {
previousLateRequestCounter, _ := results[apiServerName]
results[apiServerName] = previousLateRequestCounter + lateRequestCounter
if lateRequestCounter > 0 {
previousLateRequestCounter, _ := results[apiServerName]
results[apiServerName] = previousLateRequestCounter + lateRequestCounter
}
}
}

Expand All @@ -251,6 +268,10 @@ var _ = g.Describe("[sig-api-machinery][Feature:APIServer][Late]", func() {
})
})

func isGzipFileByExtension(fileName string) bool {
return strings.HasSuffix(fileName, ".gz")
}

func extractAPIServerNameFromAuditFile(auditFileName string) string {
pos := strings.Index(auditFileName, "-audit")
if pos == -1 {
Expand Down