diff --git a/pkg/monitortests/node/kubeletlogcollector/node.go b/pkg/monitortests/node/kubeletlogcollector/node.go
index 2eea816d4129..9c83e5861a78 100644
--- a/pkg/monitortests/node/kubeletlogcollector/node.go
+++ b/pkg/monitortests/node/kubeletlogcollector/node.go
@@ -5,6 +5,7 @@ import (
 	"bytes"
 	"context"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"regexp"
@@ -14,12 +15,16 @@ import (
 	"sync"
 	"time"
 
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/apimachinery/pkg/util/wait"
 
 	"github.com/openshift/origin/pkg/monitor/monitorapi"
 	"github.com/openshift/origin/pkg/monitortestlibrary/utility"
 	"github.com/openshift/origin/pkg/monitortests/kubeapiserver/staticpodinstall/kubeletlogparser"
+	"github.com/openshift/origin/test/extended/util/image"
 	"k8s.io/client-go/kubernetes"
 )
 
@@ -42,7 +47,7 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
 			defer wg.Done()
 
 			// TODO limit by begin/end here instead of post-processing
-			nodeLogs, err := getNodeLog(ctx, kubeClient, nodeName, "kubelet")
+			nodeLogs, err := GetNodeLog(ctx, kubeClient, nodeName, "kubelet")
 			if err != nil {
 				fmt.Fprintf(os.Stderr, "Error getting node logs from %s: %s", nodeName, err.Error())
 				errCh <- err
@@ -50,7 +55,7 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
 			}
 			newEvents := eventsFromKubeletLogs(nodeName, nodeLogs)
 
-			ovsVswitchdLogs, err := getNodeLog(ctx, kubeClient, nodeName, "ovs-vswitchd")
+			ovsVswitchdLogs, err := GetNodeLog(ctx, kubeClient, nodeName, "ovs-vswitchd")
 			if err != nil {
 				fmt.Fprintf(os.Stderr, "Error getting node ovs-vswitchd logs from %s: %s", nodeName, err.Error())
 				errCh <- err
@@ -58,7 +63,7 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
 			}
 			newOVSEvents := intervalsFromOVSVswitchdLogs(nodeName, ovsVswitchdLogs)
 
-			networkManagerLogs, err := getNodeLog(ctx, kubeClient, nodeName, "NetworkManager")
+			networkManagerLogs, err := GetNodeLog(ctx, kubeClient, nodeName, "NetworkManager")
 			if err != nil {
 				fmt.Fprintf(os.Stderr, "Error getting node NetworkManager logs from %s: %s", nodeName, err.Error())
 				errCh <- err
@@ -66,7 +71,7 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
 			}
 			newNetworkManagerIntervals := intervalsFromNetworkManagerLogs(nodeName, networkManagerLogs)
 
-			systemdCoreDumpLogs, err := getNodeLog(ctx, kubeClient, nodeName, "systemd-coredump")
+			systemdCoreDumpLogs, err := GetNodeLog(ctx, kubeClient, nodeName, "systemd-coredump")
 			if err != nil {
 				fmt.Fprintf(os.Stderr, "Error getting node systemd-coredump logs from %s: %s", nodeName, err.Error())
 				errCh <- err
@@ -692,9 +697,10 @@ func commonErrorInterval(nodeName, logLine string, messageExp *regexp.Regexp, re
 	}
 }
 
-// getNodeLog returns logs for a particular systemd service on a given node.
-// We're count on these logs to fit into some reasonable memory size.
-func getNodeLog(ctx context.Context, client kubernetes.Interface, nodeName, systemdServiceName string) ([]byte, error) {
+// GetNodeLog returns logs for a particular systemd service on a given node.
+// We're counting on these logs to fit into some reasonable memory size.
+// If the kubelet API is unavailable, it falls back to using a debug pod.
+func GetNodeLog(ctx context.Context, client kubernetes.Interface, nodeName, systemdServiceName string) ([]byte, error) {
 	path := client.CoreV1().RESTClient().Get().
 		Namespace("").Name(nodeName).
Resource("nodes").SubResource("proxy", "logs").Suffix("journal").URL().Path @@ -706,9 +712,194 @@ func getNodeLog(ctx context.Context, client kubernetes.Interface, nodeName, syst in, err := req.Stream(ctx) if err != nil { - return nil, err + // Kubelet API might be down, try fallback method using debug pod + fmt.Fprintf(os.Stderr, "Failed to get logs via kubelet API for %s on node %s: %v. Trying fallback method...\n", + systemdServiceName, nodeName, err) + return getNodeLogViaDebugPod(ctx, client, nodeName, systemdServiceName) } defer in.Close() return ioutil.ReadAll(in) } + +// getNodeLogViaDebugPod collects systemd service logs using a privileged debug pod. +// This method works even when kubelet is down because it directly accesses the host filesystem. +func getNodeLogViaDebugPod(ctx context.Context, client kubernetes.Interface, nodeName, systemdServiceName string) ([]byte, error) { + const debugNamespace = "openshift-e2e-node-log-collector" + + // Ensure namespace exists + if err := ensureNamespace(ctx, client, debugNamespace); err != nil { + return nil, fmt.Errorf("failed to ensure namespace %s: %w", debugNamespace, err) + } + + // Create debug pod + podName := fmt.Sprintf("node-log-%s-%d", nodeName, time.Now().Unix()) + _, err := createDebugPod(ctx, client, debugNamespace, podName, nodeName, systemdServiceName) + if err != nil { + return nil, fmt.Errorf("failed to create debug pod: %w", err) + } + + // Ensure pod cleanup + defer func() { + deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + client.CoreV1().Pods(debugNamespace).Delete(deleteCtx, podName, metav1.DeleteOptions{}) + }() + + // Wait for pod to complete + if err := waitForPodCompletion(ctx, client, debugNamespace, podName); err != nil { + return nil, fmt.Errorf("pod did not complete: %w", err) + } + + // Get logs from the pod + logs, err := getPodLogs(ctx, client, debugNamespace, podName) + if err != nil { + return nil, fmt.Errorf("failed to get pod logs: %w", err) + } + + return logs, nil +} + +// ensureNamespace creates the namespace if it doesn't exist +func ensureNamespace(ctx context.Context, client kubernetes.Interface, namespace string) error { + _, err := client.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) + if err == nil { + return nil + } + + if !errors.IsNotFound(err) { + return err + } + + // Create namespace with privileged labels + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + Labels: map[string]string{ + "pod-security.kubernetes.io/enforce": "privileged", + "pod-security.kubernetes.io/audit": "privileged", + "pod-security.kubernetes.io/warn": "privileged", + "security.openshift.io/scc.podSecurityLabelSync": "false", + }, + }, + } + _, err = client.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}) + return err +} + +// createDebugPod creates a privileged pod that runs journalctl to collect logs +func createDebugPod(ctx context.Context, client kubernetes.Interface, namespace, podName, nodeName, systemdServiceName string) (*corev1.Pod, error) { + privileged := true + hostPathDirectory := corev1.HostPathDirectory + zero := int64(0) + + // Build journalctl command to collect logs from the last 24 hours + command := fmt.Sprintf("chroot /host journalctl --utc --no-pager -u %s --since=-1d", systemdServiceName) + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + Labels: map[string]string{ + "app": "node-log-collector", + }, + }, + Spec: corev1.PodSpec{ + 
NodeName: nodeName, + HostPID: true, + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Name: "log-collector", + Image: image.ShellImage(), + Command: []string{ + "/bin/bash", + "-c", + command, + }, + SecurityContext: &corev1.SecurityContext{ + Privileged: &privileged, + RunAsUser: &zero, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "host", + MountPath: "/host", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "host", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/", + Type: &hostPathDirectory, + }, + }, + }, + }, + Tolerations: []corev1.Toleration{ + { + Operator: corev1.TolerationOpExists, + }, + }, + }, + } + + return client.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) +} + +// waitForPodCompletion waits for the pod to reach completed or failed state +func waitForPodCompletion(ctx context.Context, client kubernetes.Interface, namespace, podName string) error { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { + pod, err := client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return false, err + } + + switch pod.Status.Phase { + case corev1.PodSucceeded: + return true, nil + case corev1.PodFailed: + return false, fmt.Errorf("pod failed with reason: %s, message: %s", + pod.Status.Reason, pod.Status.Message) + case corev1.PodPending, corev1.PodRunning: + // Check if container has terminated + if len(pod.Status.ContainerStatuses) > 0 { + containerStatus := pod.Status.ContainerStatuses[0] + if containerStatus.State.Terminated != nil { + if containerStatus.State.Terminated.ExitCode == 0 { + return true, nil + } + return false, fmt.Errorf("container exited with code %d: %s", + containerStatus.State.Terminated.ExitCode, + containerStatus.State.Terminated.Message) + } + } + return false, nil + default: + return false, fmt.Errorf("unexpected pod phase: %s", pod.Status.Phase) + } + }) +} + +// getPodLogs retrieves the logs from a completed pod +func getPodLogs(ctx context.Context, client kubernetes.Interface, namespace, podName string) ([]byte, error) { + req := client.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{}) + + podLogs, err := req.Stream(ctx) + if err != nil { + return nil, err + } + defer podLogs.Close() + + buf := new(bytes.Buffer) + _, err = io.Copy(buf, podLogs) + if err != nil { + return nil, err + } + + return buf.Bytes(), nil +} diff --git a/pkg/monitortests/node/logcollectorservice/logcollector-server.sh b/pkg/monitortests/node/logcollectorservice/logcollector-server.sh new file mode 100644 index 000000000000..11e1cbc0f9ba --- /dev/null +++ b/pkg/monitortests/node/logcollectorservice/logcollector-server.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# OpenShift Node Log Collector Service +# This service starts early in the boot process and provides HTTP access to systemd logs +# even when kubelet is down. 
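+#
+# Illustrative usage from another host (assumes the node's internal IP is reachable
+# from the caller; the port and endpoints are defined below):
+#   curl -s http://<node-ip>:9333/health
+#   curl -s http://<node-ip>:9333/logs/kubelet
+#   curl -s http://<node-ip>:9333/logs/both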
+ +PORT=9333 +LOG_LINES=50 + +# Function to get the last N lines of a systemd service +get_service_logs() { + local service=$1 + local lines=${2:-$LOG_LINES} + + # Use journalctl to get logs, with fallback to "not found" if service doesn't exist + journalctl -u "${service}.service" --no-pager -n "$lines" 2>/dev/null || \ + echo "Service ${service} not found or no logs available" +} + +# Simple HTTP server using netcat +handle_request() { + local request + read -r request + + # Parse the request path + local path=$(echo "$request" | awk '{print $2}') + + case "$path" in + /logs/kubelet) + echo -ne "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n" + get_service_logs "kubelet" + ;; + /logs/crio) + echo -ne "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n" + get_service_logs "crio" + ;; + /logs/both) + echo -ne "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n" + echo "=== KUBELET LOGS ===" + get_service_logs "kubelet" + echo "" + echo "=== CRIO LOGS ===" + get_service_logs "crio" + ;; + /health) + echo -ne "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n" + echo "OK" + ;; + *) + echo -ne "HTTP/1.1 404 Not Found\r\nContent-Type: text/plain\r\n\r\n" + echo "Available endpoints: /logs/kubelet, /logs/crio, /logs/both, /health" + ;; + esac +} + +# Main server loop +echo "Starting OpenShift Node Log Collector Service on port $PORT" +while true; do + handle_request | nc -l -p $PORT -q 1 +done diff --git a/pkg/monitortests/node/logcollectorservice/monitortest.go b/pkg/monitortests/node/logcollectorservice/monitortest.go new file mode 100644 index 000000000000..3737eda50da5 --- /dev/null +++ b/pkg/monitortests/node/logcollectorservice/monitortest.go @@ -0,0 +1,200 @@ +package logcollectorservice + +import ( + "context" + "fmt" + "io/ioutil" + "net/http" + "time" + + exutil "github.com/openshift/origin/test/extended/util" + + mcfgv1 "github.com/openshift/api/machineconfiguration/v1" + mcfgclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "sigs.k8s.io/yaml" +) + +const ( + logCollectorPort = 9333 +) + +// FetchLogsFromService retrieves logs from the log collector service running on a node. +// This function can be called even when kubelet is down because the service starts +// before kubelet and listens on port 9333. 
+func FetchLogsFromService(ctx context.Context, kubeClient kubernetes.Interface, nodeName, logType string) (string, error) { + // Get the node to find its internal IP + node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return "", fmt.Errorf("failed to get node %s: %w", nodeName, err) + } + + var nodeIP string + for _, addr := range node.Status.Addresses { + if addr.Type == corev1.NodeInternalIP { + nodeIP = addr.Address + break + } + } + + if nodeIP == "" { + return "", fmt.Errorf("no internal IP found for node %s", nodeName) + } + + // Construct the URL based on log type + var endpoint string + switch logType { + case "kubelet": + endpoint = fmt.Sprintf("http://%s:%d/logs/kubelet", nodeIP, logCollectorPort) + case "crio": + endpoint = fmt.Sprintf("http://%s:%d/logs/crio", nodeIP, logCollectorPort) + case "both": + endpoint = fmt.Sprintf("http://%s:%d/logs/both", nodeIP, logCollectorPort) + default: + return "", fmt.Errorf("unknown log type: %s (valid: kubelet, crio, both)", logType) + } + + // Create HTTP client with timeout + client := &http.Client{ + Timeout: 30 * time.Second, + } + + resp, err := client.Get(endpoint) + if err != nil { + return "", fmt.Errorf("failed to fetch logs from %s: %w", endpoint, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("log collector service returned status %d", resp.StatusCode) + } + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to read response body: %w", err) + } + + return string(body), nil +} + +// IsServiceHealthy checks if the log collector service is running on a node +func IsServiceHealthy(ctx context.Context, kubeClient kubernetes.Interface, nodeName string) bool { + node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + var nodeIP string + for _, addr := range node.Status.Addresses { + if addr.Type == corev1.NodeInternalIP { + nodeIP = addr.Address + break + } + } + + if nodeIP == "" { + return false + } + + healthURL := fmt.Sprintf("http://%s:%d/health", nodeIP, logCollectorPort) + client := &http.Client{Timeout: 5 * time.Second} + + resp, err := client.Get(healthURL) + if err != nil { + return false + } + defer resp.Body.Close() + + return resp.StatusCode == http.StatusOK +} + +// EnsureLogCollectorForPool deploys the log collector service to a specific MachineConfigPool. +// This function can be called from tests after they create custom MachineConfigPools. +// It's idempotent - if the MachineConfig already exists, it will skip creation. 
+func EnsureLogCollectorForPool(ctx context.Context, adminRESTConfig *rest.Config, poolName string) error { + mcfgClient, err := mcfgclient.NewForConfig(adminRESTConfig) + if err != nil { + return fmt.Errorf("failed to create machine config client: %w", err) + } + + // Load the template MachineConfig from the YAML file + mcPath := exutil.FixturePath("testdata", "machine_config", "machineconfig", "99-node-log-collector.yaml") + + data, err := ioutil.ReadFile(mcPath) + if err != nil { + return fmt.Errorf("failed to read MachineConfig file %s: %w", mcPath, err) + } + + // Parse the YAML into a MachineConfig object + var mc mcfgv1.MachineConfig + if err := yaml.Unmarshal(data, &mc); err != nil { + return fmt.Errorf("failed to unmarshal MachineConfig: %w", err) + } + + // Customize the MachineConfig for this pool + mcName := fmt.Sprintf("99-%s-node-log-collector", poolName) + mc.ObjectMeta.Name = mcName + mc.ObjectMeta.Labels = map[string]string{ + "machineconfiguration.openshift.io/role": poolName, + } + + // Check if MachineConfig already exists + existing, err := mcfgClient.MachineconfigurationV1().MachineConfigs().Get(ctx, mcName, metav1.GetOptions{}) + if err == nil { + // MachineConfig already exists + fmt.Printf("MachineConfig %s already exists (created at %v), skipping creation\n", + mcName, existing.CreationTimestamp) + return nil + } + + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to check if MachineConfig exists: %w", err) + } + + // Create the MachineConfig + fmt.Printf("Creating MachineConfig %s for pool %s on port %d\n", + mcName, poolName, logCollectorPort) + + _, err = mcfgClient.MachineconfigurationV1().MachineConfigs().Create( + ctx, + &mc, + metav1.CreateOptions{}, + ) + + if err != nil { + return fmt.Errorf("failed to create MachineConfig: %w", err) + } + + fmt.Printf("Successfully created MachineConfig %s for pool %s\n", mcName, poolName) + + // Wait for the MachineConfigPool to start updating + err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, + func(ctx context.Context) (bool, error) { + mcp, err := mcfgClient.MachineconfigurationV1().MachineConfigPools().Get(ctx, poolName, metav1.GetOptions{}) + if err != nil { + return false, err + } + + // Check if MCP is updating + for _, condition := range mcp.Status.Conditions { + if condition.Type == "Updating" && condition.Status == "True" { + fmt.Printf("MachineConfigPool '%s' is now updating\n", poolName) + return true, nil + } + } + + return false, nil + }) + + if err != nil { + // Don't fail on timeout - the MCP might update later + fmt.Printf("MachineConfigPool '%s' update wait timed out (this is OK): %v\n", poolName, err) + } + + return nil +} diff --git a/pkg/monitortests/node/logcollectorservice/node-log-collector.service b/pkg/monitortests/node/logcollectorservice/node-log-collector.service new file mode 100644 index 000000000000..e9d2f0044ae7 --- /dev/null +++ b/pkg/monitortests/node/logcollectorservice/node-log-collector.service @@ -0,0 +1,26 @@ +[Unit] +Description=OpenShift Node Log Collector Service +Documentation=https://github.com/openshift/origin +# Start after basic network is available but before kubelet and crio +After=network.target systemd-journald.service +Before=kubelet.service crio.service +# Don't fail if kubelet or crio fails - we want to be available for debugging +DefaultDependencies=no + +[Service] +Type=simple +ExecStart=/usr/local/bin/node-log-collector.sh +Restart=always +RestartSec=5 +StandardOutput=journal +StandardError=journal + +# Run as root to access 
journal logs +User=root + +# Security settings (minimal restrictions since we need journal access) +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target diff --git a/test/extended/machine_config/pinnedimages.go b/test/extended/machine_config/pinnedimages.go index 56b7fffb0e30..0dac8080b637 100644 --- a/test/extended/machine_config/pinnedimages.go +++ b/test/extended/machine_config/pinnedimages.go @@ -13,6 +13,8 @@ import ( mcfgv1 "github.com/openshift/api/machineconfiguration/v1" mcClient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" + "github.com/openshift/origin/pkg/monitortests/node/kubeletlogcollector" + "github.com/openshift/origin/pkg/monitortests/node/logcollectorservice" exutil "github.com/openshift/origin/test/extended/util" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -91,6 +93,11 @@ var _ = g.Describe("[Suite:openshift/machine-config-operator/disruptive][sig-mco err = oc.Run("apply").Args("-f", mcpFixture).Execute() o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error creating MCP `custom`: %v", pisFixture)) + // Deploy log collector service to the custom pool + ctx := context.TODO() + err = logcollectorservice.EnsureLogCollectorForPool(ctx, oc.KubeFramework().ClientConfig(), "custom") + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error deploying log collector to custom pool: %v", err)) + // Add node to custom MCP & wait for the node to be ready in the MCP optedNodes, err := addWorkerNodesToCustomPool(oc, kubeClient, 1, "custom") o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error adding node to `custom` MCP: %v", err)) @@ -163,6 +170,11 @@ var _ = g.Describe("[Suite:openshift/machine-config-operator/disruptive][sig-mco err = oc.Run("apply").Args("-f", mcpFixture).Execute() o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error creating MCP `custom`: %v", pisFixture)) + // Deploy log collector service to the custom pool + ctx := context.TODO() + err = logcollectorservice.EnsureLogCollectorForPool(ctx, oc.KubeFramework().ClientConfig(), "custom") + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error deploying log collector to custom pool: %v", err)) + // Add node to custom MCP & wait for the node to be ready in the MCP optedNodes, err := addWorkerNodesToCustomPool(oc, kubeClient, 1, "custom") o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error adding node to `custom` MCP: %v", err)) @@ -311,6 +323,11 @@ var _ = g.Describe("[Suite:openshift/machine-config-operator/disruptive][sig-mco err = oc.Run("apply").Args("-f", mcpFixture).Execute() o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error creating MCP `custom`: %v", pisFixture)) + // Deploy log collector service to the custom pool + ctx := context.TODO() + err = logcollectorservice.EnsureLogCollectorForPool(ctx, oc.KubeFramework().ClientConfig(), "custom") + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error deploying log collector to custom pool: %v", err)) + // Add node to custom MCP & wait for the node to be ready in the MCP optedNodes, err := addWorkerNodesToCustomPool(oc, kubeClient, 1, "custom") o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error adding node to `custom` MCP: %v", err)) @@ -562,34 +579,119 @@ func waitTillImageGC(oc *exutil.CLI, nodeName, imageName string) { }, 10*time.Minute, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for Node '%s' to garbage collect '%s'.", nodeName, imageName) } -// `waitForReboot` waits for up to 5 minutes for the input node to start a reboot and 
then up to 15
-// minutes for the node to complete its reboot.
+// `waitForReboot` waits for up to 5 minutes for the input node to start a reboot and then up to 45
+// minutes for the node to complete its reboot. If the reboot fails, kubelet and crio logs are collected.
 func waitForReboot(kubeClient *kubernetes.Clientset, nodeName string) {
-	o.Eventually(func() bool {
+	// Wait for node to enter reboot state
+	err := wait.PollImmediate(10*time.Second, 5*time.Minute, func() (bool, error) {
 		node, err := kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
 		if err != nil {
 			framework.Logf("Failed to grab Node '%s', error :%s", nodeName, err)
-			return false
+			return false, nil
 		}
 		if node.Annotations["machineconfiguration.openshift.io/state"] == "Working" {
 			framework.Logf("Node '%s' has entered reboot", nodeName)
-			return true
+			return true, nil
 		}
-		return false
-	}, 5*time.Minute, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for Node '%s' to start reboot.", nodeName)
+		return false, nil
+	})
 
-	o.Eventually(func() bool {
+	if err != nil {
+		// Collect kubelet and crio logs to help debug why reboot didn't start
+		framework.Logf("Node '%s' failed to start reboot, collecting diagnostic logs...", nodeName)
+		collectNodeSystemLogs(kubeClient, nodeName, "reboot-start-failure")
+		o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for Node '%s' to start reboot.", nodeName)
+	}
+
+	// Wait for node to complete reboot
+	err = wait.PollImmediate(10*time.Second, 45*time.Minute, func() (bool, error) {
 		node, err := kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
 		if err != nil {
 			framework.Logf("Failed to grab Node '%s', error :%s", nodeName, err)
-			return false
+			return false, nil
 		}
 		if node.Annotations["machineconfiguration.openshift.io/state"] == "Done" && len(node.Spec.Taints) == 0 {
 			framework.Logf("Node '%s' has finished reboot", nodeName)
-			return true
+			return true, nil
 		}
-		return false
-	}, 15*time.Minute, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for Node '%s' to finish reboot.", nodeName)
+		return false, nil
+	})
+
+	if err != nil {
+		// Collect kubelet and crio logs to help debug why reboot didn't complete
+		framework.Logf("Node '%s' failed to complete reboot, collecting diagnostic logs...", nodeName)
+		collectNodeSystemLogs(kubeClient, nodeName, "reboot-complete-failure")
+		o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for Node '%s' to finish reboot.", nodeName)
+	}
+}
+
+// collectNodeSystemLogs collects kubelet and crio logs from a node when operations fail.
+// It tries three methods in order:
+// 1. Log Collector Service (port 9333) - works even before kubelet starts
+// 2. kubeletlogcollector.GetNodeLog - uses kubelet API with debug pod fallback
+// 3.
Direct debug pod - last resort +func collectNodeSystemLogs(kubeClient *kubernetes.Clientset, nodeName, phase string) { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) + defer cancel() + + // Try to collect both kubelet and crio logs together first using the log collector service + framework.Logf("Attempting to collect logs from log collector service on node %s (phase: %s)...", nodeName, phase) + + logs, err := logcollectorservice.FetchLogsFromService(ctx, kubeClient, nodeName, "both") + if err == nil && logs != "" { + framework.Logf("Successfully collected logs from log collector service") + + // Print last 50 lines to test output + lines := strings.Split(logs, "\n") + startLine := 0 + if len(lines) > 50 { + startLine = len(lines) - 50 + } + framework.Logf("Last 50 lines from log collector service on node %s:\n%s", + nodeName, strings.Join(lines[startLine:], "\n")) + + // Save full logs to /tmp + logFileName := fmt.Sprintf("/tmp/node-%s-combined-%s-%d.log", + nodeName, phase, time.Now().Unix()) + if err := ioutil.WriteFile(logFileName, []byte(logs), 0644); err != nil { + framework.Logf("Failed to write combined logs to %s: %v", logFileName, err) + } else { + framework.Logf("Saved full combined logs to %s (%d bytes)", logFileName, len(logs)) + } + return + } + + framework.Logf("Log collector service unavailable (%v), falling back to kubelet API method...", err) + + // Fallback to kubeletlogcollector.GetNodeLog (which has its own debug pod fallback) + services := []string{"kubelet", "crio"} + for _, service := range services { + framework.Logf("Collecting %s logs from node %s using kubelet API fallback method...", service, nodeName) + + logs, err := kubeletlogcollector.GetNodeLog(ctx, kubeClient, nodeName, service) + if err != nil { + framework.Logf("Failed to collect %s logs from node %s: %v", service, nodeName, err) + continue + } + + // Print last 50 lines to test output for immediate visibility + lines := strings.Split(string(logs), "\n") + startLine := 0 + if len(lines) > 50 { + startLine = len(lines) - 50 + } + framework.Logf("Last 50 lines of %s logs from node %s:\n%s", + service, nodeName, strings.Join(lines[startLine:], "\n")) + + // Also save full logs to /tmp for post-mortem analysis + logFileName := fmt.Sprintf("/tmp/node-%s-%s-%s-%d.log", + nodeName, service, phase, time.Now().Unix()) + if err := ioutil.WriteFile(logFileName, logs, 0644); err != nil { + framework.Logf("Failed to write %s logs to %s: %v", service, logFileName, err) + } else { + framework.Logf("Saved full %s logs to %s (%d bytes)", service, logFileName, len(logs)) + } + } } // `waitTillNodeReadyWithConfig` loops for up to 5 minutes to check whether the input node reaches diff --git a/test/extended/testdata/machine_config/machineconfig/99-node-log-collector.yaml b/test/extended/testdata/machine_config/machineconfig/99-node-log-collector.yaml new file mode 100644 index 000000000000..b38243fc3b72 --- /dev/null +++ b/test/extended/testdata/machine_config/machineconfig/99-node-log-collector.yaml @@ -0,0 +1,41 @@ +apiVersion: machineconfiguration.openshift.io/v1 +kind: MachineConfig +metadata: + labels: + machineconfiguration.openshift.io/role: worker + name: 99-node-log-collector-service +spec: + config: + ignition: + version: 3.2.0 + storage: + files: + - contents: + source: 
data:,%23%21/bin/bash%0A%23%20OpenShift%20Node%20Log%20Collector%20Service%0A%23%20This%20service%20starts%20early%20in%20the%20boot%20process%20and%20provides%20HTTP%20access%20to%20systemd%20logs%0A%23%20even%20when%20kubelet%20is%20down.%0A%0APORT%3D9333%0ALOG_LINES%3D50%0A%0A%23%20Function%20to%20get%20the%20last%20N%20lines%20of%20a%20systemd%20service%0Aget_service_logs%28%29%20%7B%0A%20%20%20%20local%20service%3D%241%0A%20%20%20%20local%20lines%3D%24%7B2%3A-%24LOG_LINES%7D%0A%0A%20%20%20%20%23%20Use%20journalctl%20to%20get%20logs%2C%20with%20fallback%20to%20%22not%20found%22%20if%20service%20doesn%27t%20exist%0A%20%20%20%20journalctl%20-u%20%22%24%7Bservice%7D.service%22%20--no-pager%20-n%20%22%24lines%22%202%3E/dev/null%20%7C%7C%20%5C%0A%20%20%20%20%20%20%20%20echo%20%22Service%20%24%7Bservice%7D%20not%20found%20or%20no%20logs%20available%22%0A%7D%0A%0A%23%20Simple%20HTTP%20server%20using%20netcat%0Ahandle_request%28%29%20%7B%0A%20%20%20%20local%20request%0A%20%20%20%20read%20-r%20request%0A%0A%20%20%20%20%23%20Parse%20the%20request%20path%0A%20%20%20%20local%20path%3D%24%28echo%20%22%24request%22%20%7C%20awk%20%27%7Bprint%20%242%7D%27%29%0A%0A%20%20%20%20case%20%22%24path%22%20in%0A%20%20%20%20%20%20%20%20/logs/kubelet%29%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20-ne%20%22HTTP/1.1%20200%20OK%5Cr%5CnContent-Type%3A%20text/plain%5Cr%5Cn%5Cr%5Cn%22%0A%20%20%20%20%20%20%20%20%20%20%20%20get_service_logs%20%22kubelet%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%3B%3B%0A%20%20%20%20%20%20%20%20/logs/crio%29%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20-ne%20%22HTTP/1.1%20200%20OK%5Cr%5CnContent-Type%3A%20text/plain%5Cr%5Cn%5Cr%5Cn%22%0A%20%20%20%20%20%20%20%20%20%20%20%20get_service_logs%20%22crio%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%3B%3B%0A%20%20%20%20%20%20%20%20/logs/both%29%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20-ne%20%22HTTP/1.1%20200%20OK%5Cr%5CnContent-Type%3A%20text/plain%5Cr%5Cn%5Cr%5Cn%22%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20%22%3D%3D%3D%20KUBELET%20LOGS%20%3D%3D%3D%22%0A%20%20%20%20%20%20%20%20%20%20%20%20get_service_logs%20%22kubelet%22%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20%22%3D%3D%3D%20CRIO%20LOGS%20%3D%3D%3D%22%0A%20%20%20%20%20%20%20%20%20%20%20%20get_service_logs%20%22crio%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%3B%3B%0A%20%20%20%20%20%20%20%20/health%29%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20-ne%20%22HTTP/1.1%20200%20OK%5Cr%5CnContent-Type%3A%20text/plain%5Cr%5Cn%5Cr%5Cn%22%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20%22OK%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%3B%3B%0A%20%20%20%20%20%20%20%20%2A%29%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20-ne%20%22HTTP/1.1%20404%20Not%20Found%5Cr%5CnContent-Type%3A%20text/plain%5Cr%5Cn%5Cr%5Cn%22%0A%20%20%20%20%20%20%20%20%20%20%20%20echo%20%22Available%20endpoints%3A%20/logs/kubelet%2C%20/logs/crio%2C%20/logs/both%2C%20/health%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%3B%3B%0A%20%20%20%20esac%0A%7D%0A%0A%23%20Main%20server%20loop%0Aecho%20%22Starting%20OpenShift%20Node%20Log%20Collector%20Service%20on%20port%20%24PORT%22%0Awhile%20true%3B%20do%0A%20%20%20%20handle_request%20%7C%20nc%20-l%20-p%20%24PORT%20-q%201%0Adone%0A + mode: 493 + path: /usr/local/bin/node-log-collector.sh + systemd: + units: + - contents: | + [Unit] + Description=OpenShift Node Log Collector Service + Documentation=https://github.com/openshift/origin + After=network.target systemd-journald.service + Before=kubelet.service crio.service + DefaultDependencies=no + + 
[Service] + Type=simple + ExecStart=/usr/local/bin/node-log-collector.sh + Restart=always + RestartSec=5 + StandardOutput=journal + StandardError=journal + User=root + NoNewPrivileges=true + PrivateTmp=true + + [Install] + WantedBy=multi-user.target + enabled: true + name: node-log-collector.service
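A minimal sketch of how a caller could combine the helpers added above (illustrative only, not part of the change set; the function name collectBestEffortLogs is an assumption, while the exported helpers are used with the signatures defined in this diff):

package example

import (
	"context"

	"github.com/openshift/origin/pkg/monitortests/node/kubeletlogcollector"
	"github.com/openshift/origin/pkg/monitortests/node/logcollectorservice"
	"k8s.io/client-go/kubernetes"
)

// collectBestEffortLogs prefers the early-boot HTTP log collector, which answers even
// when kubelet is down, and falls back to the kubelet API / debug-pod path otherwise.
func collectBestEffortLogs(ctx context.Context, client kubernetes.Interface, nodeName string) (string, error) {
	if logcollectorservice.IsServiceHealthy(ctx, client, nodeName) {
		return logcollectorservice.FetchLogsFromService(ctx, client, nodeName, "both")
	}
	logs, err := kubeletlogcollector.GetNodeLog(ctx, client, nodeName, "kubelet")
	return string(logs), err
}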