diff --git a/assisted_service_mcp/src/utils/log_analyzer/log_analyzer.py b/assisted_service_mcp/src/utils/log_analyzer/log_analyzer.py index d0c6850..da2df16 100644 --- a/assisted_service_mcp/src/utils/log_analyzer/log_analyzer.py +++ b/assisted_service_mcp/src/utils/log_analyzer/log_analyzer.py @@ -4,7 +4,7 @@ import json import logging -from typing import Dict, List, Any, cast +from typing import Dict, List, Any, cast, Iterator, Tuple import dateutil.parser import nestedarchive @@ -40,9 +40,7 @@ def metadata(self) -> Dict[str, Any] | None: raw_metadata = json.loads(cast(str | bytes, metadata_content)) # The metadata file contains cluster information at the root level - # Wrap it in a "cluster" key to match the expected structure - wrapped_metadata = {"cluster": raw_metadata} - self._metadata = self._clean_metadata_json(wrapped_metadata) + self._metadata = self._clean_metadata_json(raw_metadata) except Exception as e: logger.error("Failed to load metadata: %s", e) raise @@ -51,20 +49,18 @@ def metadata(self) -> Dict[str, Any] | None: @staticmethod def _clean_metadata_json(md: Dict[str, Any]) -> Dict[str, Any]: """Clean metadata JSON by separating deleted hosts.""" - installation_start_time = dateutil.parser.isoparse( - md["cluster"]["install_started_at"] - ) + installation_start_time = dateutil.parser.isoparse(md["install_started_at"]) def host_deleted_before_installation_started(host): if deleted_at := host.get("deleted_at"): return dateutil.parser.isoparse(deleted_at) < installation_start_time return False - all_hosts = md["cluster"]["hosts"] - md["cluster"]["deleted_hosts"] = [ + all_hosts = md["hosts"] + md["deleted_hosts"] = [ h for h in all_hosts if host_deleted_before_installation_started(h) ] - md["cluster"]["hosts"] = [ + md["hosts"] = [ h for h in all_hosts if not host_deleted_before_installation_started(h) ] @@ -181,15 +177,42 @@ def get_controller_logs(self) -> str: ), ) - @staticmethod - def get_hostname(host: Dict[str, Any]) -> str: - """Extract hostname from host metadata.""" - hostname = host.get("requested_hostname") - if hostname: - return hostname + def cluster_is_sno(self) -> bool: + """ + Check if the cluster is a Single Node OpenShift (SNO) cluster. + + Returns: + True if the cluster is SNO (high_availability_mode == "None"), False otherwise + """ + try: + cluster = self.metadata + return ( + cluster is not None and cluster.get("high_availability_mode") == "None" + ) + except Exception: + return False + + def all_host_journal_logs( + self, + ) -> Iterator[Tuple[Dict[str, Any], str]]: + """ + Iterate over hosts and their journal logs, skipping hosts where journal.logs is not found. + Yields: + Tuple of (host, journal_logs) for each host with available journal logs + """ try: - inventory = json.loads(host["inventory"]) - return inventory["hostname"] - except (KeyError, json.JSONDecodeError): - return host.get("id", "unknown") + cluster = self.metadata + except Exception: + return + + if cluster is None: + return + + for host in cluster.get("hosts", []): + host_id = host["id"] + try: + journal_logs = self.get_host_log_file(host_id, "journal.logs") + yield host, journal_logs + except FileNotFoundError: + continue diff --git a/assisted_service_mcp/src/utils/log_analyzer/signatures/__init__.py b/assisted_service_mcp/src/utils/log_analyzer/signatures/__init__.py index a1b809d..8c5a26f 100644 --- a/assisted_service_mcp/src/utils/log_analyzer/signatures/__init__.py +++ b/assisted_service_mcp/src/utils/log_analyzer/signatures/__init__.py @@ -6,12 +6,44 @@ import inspect from .base import Signature, ErrorSignature, SignatureResult -from .basic_info import * # noqa -from .error_detection import * # noqa -from .performance import * # noqa -from .networking import * # noqa -from .advanced_analysis import * # noqa -from .platform_specific import * # noqa + +# Import all individual signature classes +# These are used dynamically via inspect.getmembers(), so we suppress unused import warnings +from .components_version_signature import ComponentsVersionSignature # noqa: F401 +from .sno_hostname_has_etcd import SNOHostnameHasEtcd # noqa: F401 +from .api_invalid_certificate_signature import ( + ApiInvalidCertificateSignature, # noqa: F401 +) +from .api_expired_certificate_signature import ( + ApiExpiredCertificateSignature, # noqa: F401 +) +from .release_pull_error_signature import ReleasePullErrorSignature # noqa: F401 +from .error_on_cleanup_install_device import ErrorOnCleanupInstallDevice # noqa: F401 +from .missing_mc import MissingMC # noqa: F401 +from .error_creating_read_write_layer import ErrorCreatingReadWriteLayer # noqa: F401 +from .sno_machine_cidr_signature import SNOMachineCidrSignature # noqa: F401 +from .duplicate_vip import DuplicateVIP # noqa: F401 +from .nameserver_in_cluster_network import NameserverInClusterNetwork # noqa: F401 +from .networks_mtu_mismatch import NetworksMtuMismatch # noqa: F401 +from .dual_stack_bad_route import DualStackBadRoute # noqa: F401 +from .dualstackr_dns_bug import DualstackrDNSBug # noqa: F401 +from .user_managed_networking_load_balancer import ( + UserManagedNetworkingLoadBalancer, # noqa: F401 +) +from .slow_image_download_signature import SlowImageDownloadSignature # noqa: F401 +from .libvirt_reboot_flag_signature import LibvirtRebootFlagSignature # noqa: F401 +from .ip_changed_after_reboot import IpChangedAfterReboot # noqa: F401 +from .events_installation_attempts import EventsInstallationAttempts # noqa: F401 +from .controller_warnings import ControllerWarnings # noqa: F401 +from .user_has_logged_into_cluster import UserHasLoggedIntoCluster # noqa: F401 +from .failed_request_triggers_host_timeout import ( + FailedRequestTriggersHostTimeout, # noqa: F401 +) +from .controller_failed_to_start import ControllerFailedToStart # noqa: F401 +from .machine_config_daemon_error_extracting import ( + MachineConfigDaemonErrorExtracting, # noqa: F401 +) +from .container_crash_analysis import ContainerCrashAnalysis # noqa: F401 # Collect all signatures from all modules ALL_SIGNATURES = [] diff --git a/assisted_service_mcp/src/utils/log_analyzer/signatures/api_expired_certificate_signature.py b/assisted_service_mcp/src/utils/log_analyzer/signatures/api_expired_certificate_signature.py new file mode 100644 index 0000000..755dce3 --- /dev/null +++ b/assisted_service_mcp/src/utils/log_analyzer/signatures/api_expired_certificate_signature.py @@ -0,0 +1,39 @@ +""" +ApiExpiredCertificateSignature for OpenShift Assisted Installer logs. +""" + +import logging +import re +from typing import Optional + +from assisted_service_mcp.src.utils.log_analyzer.log_analyzer import ( + LOG_BUNDLE_PATH, +) + +from .base import ErrorSignature, SignatureResult + +logger = logging.getLogger(__name__) + + +class ApiExpiredCertificateSignature(ErrorSignature): + """Detect expired or not yet valid certificate in kube-apiserver logs.""" + + LOG_PATTERN = re.compile("x509: certificate has expired or is not yet valid.*") + + def analyze(self, log_analyzer) -> Optional[SignatureResult]: + path = f"{LOG_BUNDLE_PATH}/bootstrap/containers/bootstrap-control-plane/kube-apiserver.log" + try: + logs = log_analyzer.logs_archive.get(path) + except FileNotFoundError: + return None + invalid_api_log_lines = self.LOG_PATTERN.findall(logs) + if invalid_api_log_lines: + content = invalid_api_log_lines[0] + if (num_lines := len(invalid_api_log_lines)) > 1: + content += f"\nadditional {num_lines - 1} similar error log lines found" + return self.create_result( + title="Expired Certificate", + content=content, + severity="error", + ) + return None diff --git a/assisted_service_mcp/src/utils/log_analyzer/signatures/api_invalid_certificate_signature.py b/assisted_service_mcp/src/utils/log_analyzer/signatures/api_invalid_certificate_signature.py new file mode 100644 index 0000000..5a2cd6c --- /dev/null +++ b/assisted_service_mcp/src/utils/log_analyzer/signatures/api_invalid_certificate_signature.py @@ -0,0 +1,39 @@ +""" +ApiInvalidCertificateSignature for OpenShift Assisted Installer logs. +""" + +import logging +import re +from typing import Optional + +from .base import ErrorSignature, SignatureResult + +logger = logging.getLogger(__name__) + + +class ApiInvalidCertificateSignature(ErrorSignature): + """Detect invalid SAN values on certificate for AI API from controller logs.""" + + LOG_PATTERN = re.compile( + 'time=".*" level=error msg=".*x509: certificate is valid.* not .*' + ) + + def analyze(self, log_analyzer) -> Optional[SignatureResult]: + try: + controller_logs = log_analyzer.get_controller_logs() + except FileNotFoundError: + return None + + invalid_api_log_lines = self.LOG_PATTERN.findall(controller_logs) + if invalid_api_log_lines: + shown = invalid_api_log_lines[:5] + more = len(invalid_api_log_lines) - len(shown) + content = "\n".join(shown) + if more > 0: + content += f"\nadditional {more} similar error log lines found" + return self.create_result( + title="Invalid SAN values on certificate for AI API", + content=content, + severity="error", + ) + return None diff --git a/assisted_service_mcp/src/utils/log_analyzer/signatures/basic_info.py b/assisted_service_mcp/src/utils/log_analyzer/signatures/components_version_signature.py similarity index 81% rename from assisted_service_mcp/src/utils/log_analyzer/signatures/basic_info.py rename to assisted_service_mcp/src/utils/log_analyzer/signatures/components_version_signature.py index 9faa959..1b9964f 100644 --- a/assisted_service_mcp/src/utils/log_analyzer/signatures/basic_info.py +++ b/assisted_service_mcp/src/utils/log_analyzer/signatures/components_version_signature.py @@ -1,6 +1,5 @@ """ -Basic information and status signature analysis. -These signatures provide fundamental information about the cluster and installation. +ComponentsVersionSignature for OpenShift Assisted Installer logs. """ import logging @@ -18,15 +17,14 @@ def analyze(self, log_analyzer) -> Optional[SignatureResult]: """Analyze component versions.""" try: metadata = log_analyzer.metadata - cluster_md = metadata.get("cluster", {}) content_lines = [] - release_tag = metadata.get("release_tag") or cluster_md.get("release_tag") + release_tag = metadata.get("release_tag") if release_tag: content_lines.append(f"Release tag: {release_tag}") - versions = metadata.get("versions") or cluster_md.get("versions") + versions = metadata.get("versions") if versions: if "assisted-installer" in versions: content_lines.append( diff --git a/assisted_service_mcp/src/utils/log_analyzer/signatures/advanced_analysis.py b/assisted_service_mcp/src/utils/log_analyzer/signatures/container_crash_analysis.py similarity index 57% rename from assisted_service_mcp/src/utils/log_analyzer/signatures/advanced_analysis.py rename to assisted_service_mcp/src/utils/log_analyzer/signatures/container_crash_analysis.py index 5fea2fd..b8d1582 100644 --- a/assisted_service_mcp/src/utils/log_analyzer/signatures/advanced_analysis.py +++ b/assisted_service_mcp/src/utils/log_analyzer/signatures/container_crash_analysis.py @@ -1,16 +1,14 @@ """ -Advanced analysis signatures for OpenShift Assisted Installer logs. -These signatures perform complex analysis across multiple log sources. +ContainerCrashAnalysis signature for OpenShift Assisted Installer logs. """ -import json import logging import os import re from collections import defaultdict from datetime import datetime, timedelta from operator import itemgetter -from typing import Any, Generator, Optional, Callable, List, Dict +from typing import Optional, List, Dict from assisted_service_mcp.src.utils.log_analyzer.log_analyzer import ( LOG_BUNDLE_PATH, @@ -18,281 +16,9 @@ from .base import Signature, SignatureResult - -def operator_statuses_from_controller_logs( - controller_log: str, include_empty: bool = False -): - operator_regex = re.compile(r"Operator ([a-z\-]+), statuses: \[(.*)\].*") - conditions_regex = re.compile(r"\{(.+?)\}") - condition_regex = re.compile( - r"([A-Za-z]+) (False|True) ([0-9a-zA-Z\-]+ [0-9a-zA-Z\:]+ [0-9a-zA-Z\-\+]+ [A-Z]+) (.*)" - ) - operator_statuses = {} - - for operator_name, operator_status in operator_regex.findall(controller_log): - if include_empty: - operator_statuses[operator_name] = {} - operator_conditions = operator_statuses.setdefault(operator_name, {}) - for operator_conditions_raw in conditions_regex.findall(operator_status): - for ( - condition_name, - condition_result, - condition_timestamp, - condition_reason, - ) in condition_regex.findall(operator_conditions_raw): - operator_conditions[condition_name] = { - "result": condition_result == "True", - "timestamp": condition_timestamp, - "reason": condition_reason, - } - - return operator_statuses - - -def condition_has_result( - operator_conditions, expected_condition_name: str, expected_condition_result: bool -) -> bool: - return any( - condition_values["result"] == expected_condition_result - for condition_name, condition_values in operator_conditions.items() - if condition_name == expected_condition_name - ) - - -def filter_operators( - operator_statuses, - required_conditions, - aggregation_function: Callable[[Generator[Any, None, None]], bool], -): - return { - operator_name: operator_conditions - for operator_name, operator_conditions in operator_statuses.items() - if aggregation_function( - condition_has_result( - operator_conditions, required_condition_name, expected_condition_result - ) - for required_condition_name, expected_condition_result in required_conditions - ) - } - - logger = logging.getLogger(__name__) -class EventsInstallationAttempts(Signature): - """Inspects events file to check for multiple installation attempts.""" - - def analyze(self, log_analyzer) -> Optional[SignatureResult]: - """Analyze multiple installation attempts.""" - try: - # Get all cluster events and partition them by reset events - all_events = log_analyzer.get_all_cluster_events() - partitions = log_analyzer.partition_cluster_events(all_events) - installation_attempts = len(partitions) - - if installation_attempts != 1: - current_events = log_analyzer.get_last_install_cluster_events() - if current_events: - last_attempt_first_event = current_events[0] - content = ( - f"The events file for this cluster contains events from {installation_attempts} installation attempts.\n" - f"When reading the events for this ticket, make sure you look only at the events for the last installation attempt,\n" - f"the first event in that attempt happened around {last_attempt_first_event['event_time']}." - ) - - return SignatureResult( - signature_name=self.name, - title="Multiple Installation Attempts in Events File", - content=content, - severity="warning", - ) - - except Exception as e: - logger.error("Error in EventsInstallationAttempts: %s", e) - - return None - - -class ControllerWarnings(Signature): - """Search for warnings in controller logs.""" - - def analyze(self, log_analyzer) -> Optional[SignatureResult]: - try: - controller_logs = log_analyzer.get_controller_logs() - except FileNotFoundError: - return None - warnings = re.findall(r'time=".*" level=warning msg=".*', controller_logs) - if warnings: - shown = warnings[:10] - content = "\n".join(shown) - if len(warnings) > 10: - content += ( - f"\nThere are {len(warnings) - 10} additional warnings not shown" - ) - return SignatureResult( - signature_name=self.name, - title="Controller warning logs", - content=content, - severity="warning", - ) - return None - - -class UserHasLoggedIntoCluster(Signature): - """Detect user login to cluster nodes during installation.""" - - USER_LOGIN_PATTERN = re.compile( - r"pam_unix\((sshd|login):session\): session opened for user .+ by" - ) - - def analyze(self, log_analyzer) -> Optional[SignatureResult]: - cluster = log_analyzer.metadata.get("cluster", {}) - msgs = [] - for host in cluster.get("hosts", []): - host_id = host["id"] - try: - journal_logs = log_analyzer.get_host_log_file(host_id, "journal.logs") - except FileNotFoundError: - continue - if self.USER_LOGIN_PATTERN.findall(journal_logs): - msgs.append( - f"Host {host_id}: found evidence of a user login during installation. This might indicate that some settings have been changed manually; if incorrect they could contribute to failure." - ) - if msgs: - return SignatureResult( - signature_name=self.name, - title="User has logged into cluster nodes during installation", - content="\n".join(msgs), - severity="warning", - ) - return None - - -class FailedRequestTriggersHostTimeout(Signature): - """Look for failed requests that could have caused host timeout.""" - - LOG_PATTERN = re.compile( - r'time="(?P