diff --git a/.gitignore b/.gitignore
index 6b028ec3a..15534bfe1 100755
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 *.srl
 *.swp
 
+src/RepairManager/id_rsa
 src/ClusterBootstrap/ssl/ca/ca.srl
 
 **/run.sh.generated
@@ -56,6 +57,7 @@ src/WebUI/dotnet/WebPortal/hosting.json
 
 **/.suo
 **/storage.ide
+src/RepairManager/.vs/*
 /.vs/slnx.sqlite
 /.vs/ProjectSettings.json
 /.vs/VSWorkspaceState.json
diff --git a/src/RepairManager/Rules/__init__.py b/src/RepairManager/Rules/__init__.py
new file mode 100644
index 000000000..1288a4bdf
--- /dev/null
+++ b/src/RepairManager/Rules/__init__.py
@@ -0,0 +1,16 @@
+import os
+import importlib
+
+dirpath = os.path.dirname(__file__)
+dirname = os.path.basename(dirpath)
+
+# rules interface needs to be imported before dynamically importing all rules
+importlib.import_module(dirname + ".rules_abc")
+
+for module in os.listdir(dirpath):
+    if module != '__init__.py' and module != 'rules_abc.py' and module[-3:] == '.py':
+        try:
+            importlib.import_module(dirname + "." + module[:-3])
+        except Exception as e:
+            print("Could not import " + module)
+            print(e)
diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py
new file mode 100644
--- /dev/null
+++ b/src/RepairManager/Rules/ecc_rule.py
@@ -0,0 +1,125 @@
+from Rules.rules_abc import Rule
+from kubernetes import client, config
+import requests
+import json
+import os
+import time
+import yaml
+import util
+import logging
+
+# Seconds to wait for prometheus before giving up on a query, so that one
+# hung request cannot block the whole repair loop.
+PROMETHEUS_TIMEOUT = 10
+
+
+def get_node_address_info():
+    """Return a dict mapping each node's InternalIP to its Hostname.
+
+    The kube config is loaded first, then the addresses are read from the
+    node list returned by the kubernetes API.
+    """
+    config.load_kube_config()
+    api_instance = client.CoreV1Api()
+
+    node_list = api_instance.list_node()
+
+    # map InternalIP to Hostname
+    address_map = {}
+
+    if node_list:
+        for node in node_list.items:
+            internal_ip = None
+            hostname = None
+
+            for address in node.status.addresses:
+                if address.type == 'InternalIP':
+                    internal_ip = address.address
+                elif address.type == 'Hostname':
+                    hostname = address.address
+
+            # a node without an InternalIP can never be matched against a
+            # prometheus instance, so it is not stored under the key None
+            if internal_ip is not None:
+                address_map[internal_ip] = hostname
+
+    logging.debug('node address map: %s ', address_map)
+
+    return address_map
+
+
+def get_ECC_error_data(ecc_url):
+    """Query prometheus at ecc_url and return its list of result metrics.
+
+    An empty list is returned when the response body holds no data.
+    """
+    response = requests.get(ecc_url, timeout=PROMETHEUS_TIMEOUT)
+    data = json.loads(response.text)
+
+    # start from an empty list so the return value is always bound
+    ecc_metrics = []
+
+    if data:
+        ecc_metrics = data['data']['result']
+        logging.info('ECC error metrics from prometheus: %s', json.dumps(ecc_metrics))
+
+    return ecc_metrics
+
+
+class ECCRule(Rule):
+    """Rule that cordons the nodes reporting uncorrectable ECC errors."""
+
+    def __init__(self):
+        # hostnames of the nodes found by the latest check_status() call
+        self.ecc_hostnames = []
+
+    def check_status(self):
+        """Collect offending hostnames; return True if ECC errors were found.
+
+        Returns False when no errors are reported or when the check fails.
+        """
+        try:
+            with open('rule-config.yaml', 'r') as rule_config:
+                rule_settings = yaml.safe_load(rule_config)
+
+            address_map = get_node_address_info()
+
+            ecc_url = rule_settings['prometheus_url'] + rule_settings['rules']['ecc_rule']['ecc_error_url']
+            ecc_metrics = get_ECC_error_data(ecc_url)
+
+            if not ecc_metrics:
+                logging.debug('No uncorrectable ECC metrics found.')
+                return False
+
+            self.ecc_hostnames = []
+            for m in ecc_metrics:
+                # instance is expected to look like 'ip:port'
+                offending_node_ip = m['metric']['instance'].split(':')[0]
+                hostname = address_map.get(offending_node_ip)
+
+                if hostname is None:
+                    logging.warning('No hostname found for node ip %s', offending_node_ip)
+                elif hostname not in self.ecc_hostnames:
+                    # several metrics may name the same node; cordon it once
+                    self.ecc_hostnames.append(hostname)
+
+            logging.info('Uncorrectable ECC metrics found: %s', self.ecc_hostnames)
+            return True
+
+        except Exception:
+            logging.exception('Error checking status for ECCRule')
+            #TODO: send email alert, raise exception?
+            return False
+
+    def take_action(self):
+        """Cordon every node collected by the last check_status() call."""
+        try:
+            for node in self.ecc_hostnames:
+                success = util.cordon_node(node)
+
+                if success != 0:
+                    logging.warning('Unscheduling of node %s not successful', node)
+
+        except Exception:
+            logging.exception('Error taking action for ECCRule')
+            #TODO: send email alert, raise exception?
diff --git a/src/RepairManager/Rules/rules_abc.py b/src/RepairManager/Rules/rules_abc.py
new file mode 100644
index 000000000..278f7f2ec
--- /dev/null
+++ b/src/RepairManager/Rules/rules_abc.py
@@ -0,0 +1,11 @@
+import abc
+
+class Rule(abc.ABC):
+
+    @abc.abstractmethod
+    def check_status(self):
+        pass
+
+    @abc.abstractmethod
+    def take_action(self):
+        pass
diff --git a/src/RepairManager/logging.yaml b/src/RepairManager/logging.yaml
new file mode 100644
index 000000000..f3ab1e247
--- /dev/null
+++ b/src/RepairManager/logging.yaml
@@ -0,0 +1,28 @@
+---
+version: 1
+disable_existing_loggers: False
+formatters:
+    simple:
+        format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
+handlers:
+    console:
+        class: logging.StreamHandler
+        level: DEBUG
+        formatter: simple
+        stream: ext://sys.stdout
+    file:
+        class : logging.handlers.RotatingFileHandler
+        formatter: simple
+        filename: repairmanager.log
+        # roll over at 10MB
+        maxBytes: 10240000
+        # At most 10 logging files
+        backupCount: 10
+loggers:
+    basic:
+        level: DEBUG
+        handlers: ['console','file']
+        propagate: no
+root:
+    level: DEBUG
+    handlers: ['console','file']
diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py
new file mode 100755
--- /dev/null
+++ b/src/RepairManager/main.py
@@ -0,0 +1,76 @@
+import time
+import sys
+import yaml
+import logging
+import logging.config
+import importlib
+
+import Rules
+
+# Pause used until rule-config.yaml has been loaded successfully once.
+DEFAULT_WAIT_TIME = 10
+
+with open('logging.yaml', 'r') as log_file:
+    log_config = yaml.safe_load(log_file)
+
+logging.config.dictConfig(log_config)
+logger = logging.getLogger(__name__)
+
+logger.debug('Repair manager controller has started')
+
+# Last successfully loaded settings. They are bound before the loop so that
+# a failed (re)load falls back to the previous values instead of raising
+# NameError on the first pass.
+rules = {}
+wait_time = DEFAULT_WAIT_TIME
+
+try:
+    while True:
+        try:
+            #reload module
+            importlib.reload(Rules)
+
+            # refresh config
+            with open('rule-config.yaml', 'r') as rule_file:
+                rule_config = yaml.safe_load(rule_file)
+
+            # read both keys before rebinding so a config missing one of
+            # them leaves the previous settings untouched
+            new_rules = rule_config['rules']
+            new_wait_time = rule_config['wait_time']
+            rules, wait_time = new_rules, new_wait_time
+
+        except Exception:
+            logger.exception('Error loading modules/rule config')
+
+        if not rules:
+            # nothing to execute; still pause so the loop does not spin
+            time.sleep(wait_time)
+            continue
+
+        # execute all rules listed in config
+        for r_key in rules:
+            try:
+                module_name = rules[r_key]['module_name']
+                class_name = rules[r_key]['class_name']
+
+                r_module = sys.modules[module_name]
+                r_class = getattr(r_module, class_name)
+                rule = r_class()
+
+                logger.debug('Executing %s from module %s', class_name, module_name)
+
+                if rule.check_status():
+                    rule.take_action()
+
+            except Exception:
+                # report r_key: class_name/module_name may still be unbound
+                logger.exception('Error executing rule %s', r_key)
+                #TODO: send email alert?
+
+            # pause after every rule, failed or not, so a broken rule
+            # cannot turn the controller into a tight retry loop
+            time.sleep(wait_time)
+
+except KeyboardInterrupt:
+    pass
diff --git a/src/RepairManager/requirements.txt b/src/RepairManager/requirements.txt
new file mode 100644
index 000000000..2bf6889ff
--- /dev/null
+++ b/src/RepairManager/requirements.txt
@@ -0,0 +1,3 @@
+kubernetes==10.0.1
+requests==2.18.4
+PyYAML==5.1.2
diff --git a/src/RepairManager/rule-config.yaml b/src/RepairManager/rule-config.yaml
new file mode 100755
index 000000000..f3379113b
--- /dev/null
+++ b/src/RepairManager/rule-config.yaml
@@ -0,0 +1,13 @@
+---
+rules:
+    ecc_rule:
+        module_name : Rules.ecc_rule
+        class_name : ECCRule
+        ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0'
+
+# time to sleep between rule execution
+wait_time: 10
+
+# prometheus
+prometheus_url: 'http://localhost:9091/prometheus'
+
diff --git a/src/RepairManager/util.py b/src/RepairManager/util.py
new file mode 100644
--- /dev/null
+++ b/src/RepairManager/util.py
@@ -0,0 +1,16 @@
+import os
+import subprocess
+
+
+def cordon_node(node):
+    """Run 'kubectl cordon NODE --dry-run' and return its exit code.
+
+    The command is given as an argument list and run without a shell, so
+    the node name is never interpreted as shell syntax. 0 means success.
+    """
+    try:
+        result = subprocess.run(['kubectl', 'cordon', node, '--dry-run'])
+    except OSError:
+        # kubectl missing or not executable: report it as a failed run
+        return 127
+    return result.returncode