From 271225fa368aca1c68d18bba128ae85ee599221d Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Wed, 6 Nov 2019 22:13:22 +0000 Subject: [PATCH 1/4] ECC Repair Manager Initial Check-In --- .gitignore | 2 + src/RepairManager/Rules/__init__.py | 16 +++++ src/RepairManager/Rules/ecc_rule.py | 95 ++++++++++++++++++++++++++++ src/RepairManager/Rules/rules_abc.py | 11 ++++ src/RepairManager/Rules/test_rule.py | 11 ++++ src/RepairManager/logging.yaml | 28 ++++++++ src/RepairManager/main.py | 58 +++++++++++++++++ src/RepairManager/requirements.txt | 3 + src/RepairManager/rule-config.yaml | 12 ++++ src/RepairManager/util.py | 6 ++ 10 files changed, 242 insertions(+) create mode 100644 src/RepairManager/Rules/__init__.py create mode 100644 src/RepairManager/Rules/ecc_rule.py create mode 100644 src/RepairManager/Rules/rules_abc.py create mode 100644 src/RepairManager/Rules/test_rule.py create mode 100644 src/RepairManager/logging.yaml create mode 100755 src/RepairManager/main.py create mode 100644 src/RepairManager/requirements.txt create mode 100755 src/RepairManager/rule-config.yaml create mode 100644 src/RepairManager/util.py diff --git a/.gitignore b/.gitignore index 6b028ec3a..15534bfe1 100755 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ *.srl *.swp +src/RepairManager/id_rsa src/ClusterBootstrap/ssl/ca/ca.srl **/run.sh.generated @@ -56,6 +57,7 @@ src/WebUI/dotnet/WebPortal/hosting.json **/.suo **/storage.ide +src/RepairManager/.vs/* /.vs/slnx.sqlite /.vs/ProjectSettings.json /.vs/VSWorkspaceState.json diff --git a/src/RepairManager/Rules/__init__.py b/src/RepairManager/Rules/__init__.py new file mode 100644 index 000000000..1288a4bdf --- /dev/null +++ b/src/RepairManager/Rules/__init__.py @@ -0,0 +1,16 @@ +import os +import importlib + +dirpath = os.path.dirname(__file__) +dirname = os.path.basename(dirpath) + +# rules interface needs to be imported before dynamically importing all rules +importlib.import_module(dirname + ".rules_abc") + +for module in 
os.listdir(dirpath): + if module != '__init__.py' and module != 'rules_abc.py' and module[-3:] == '.py': + try: + importlib.import_module(dirname + "." + module[:-3]) + except Exception as e: + print("Could not import " + module) + print(e) diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py new file mode 100644 index 000000000..c44f79bb1 --- /dev/null +++ b/src/RepairManager/Rules/ecc_rule.py @@ -0,0 +1,95 @@ +from Rules.rules_abc import Rule +from kubernetes import client, config +import requests +import json +import os +import time +import yaml +import util +import logging + +def get_node_address_info(): + config.load_kube_config() + api_instance = client.CoreV1Api() + + service_account_list = api_instance.list_node() + + # map InternalIP to Hostname + address_map = {} + + if (service_account_list): + + for account in service_account_list.items: + internal_ip = None + hostname = None + + for address in account.status.addresses: + if address.type == 'InternalIP': + internal_ip = address.address + + if address.type == 'Hostname': + hostname = address.address + + address_map[internal_ip] = hostname + + logging.debug('node address map: %s ' % address_map) + + return address_map + + + +def get_ECC_error_data(ecc_url): + + response = requests.get(ecc_url) + data = json.loads(response.text) + + if data: + ecc_metrics = data['data']['result'] + logging.info('ECC error metrics from prometheus ' + json.dumps(ecc_metrics, indent=4)) + + return ecc_metrics + + + +class ECCRule(Rule): + + def check_status(self): + try: + with open('rule-config.yaml', 'r') as rule_config: + config = yaml.safe_load(rule_config) + + address_map = get_node_address_info() + + ecc_url = config['ecc_prometheus_url'] + ecc_metrics = get_ECC_error_data(ecc_url) + + global ecc_hostnames + ecc_hostnames = [] + + if (ecc_metrics): + for m in ecc_metrics: + offending_node_ip = m['metric']['instance'].split(':')[0] + ecc_hostnames.append(address_map[offending_node_ip]) + 
+ logging.info('Uncorrectable ECC metrics found: ' + ecc_hostnames) + return True + + else: + logging.debug('No uncorrectable ECC metrics found.') + return False + + except Exception as e: + logging.exception('Error checking status for ECCRule') + #TODO: send email alert, raise exception? + + def take_action(self): + try: + for node in ecc_hostnames: + success = util.cordon_node(node) + + if (success != 0): + logging.warning('Unscheduling of nodes not successful') + + except Exception as e: + logging.exception('Error taking action for ECCRule') + #TODO: send email alert, raise exception? diff --git a/src/RepairManager/Rules/rules_abc.py b/src/RepairManager/Rules/rules_abc.py new file mode 100644 index 000000000..278f7f2ec --- /dev/null +++ b/src/RepairManager/Rules/rules_abc.py @@ -0,0 +1,11 @@ +import abc + +class Rule(abc.ABC): + + @abc.abstractmethod + def check_status(self): + pass + + @abc.abstractmethod + def take_action(self): + pass diff --git a/src/RepairManager/Rules/test_rule.py b/src/RepairManager/Rules/test_rule.py new file mode 100644 index 000000000..30e9c05a4 --- /dev/null +++ b/src/RepairManager/Rules/test_rule.py @@ -0,0 +1,11 @@ +from Rules.rules_abc import Rule + +class TestRule(Rule): + + def check_status(self): + print("Test Rule, Checking Status...\n") + + return True + + def take_action(self): + print("Test Rule, Taking Action...\n") diff --git a/src/RepairManager/logging.yaml b/src/RepairManager/logging.yaml new file mode 100644 index 000000000..f3ab1e247 --- /dev/null +++ b/src/RepairManager/logging.yaml @@ -0,0 +1,28 @@ +--- +version: 1 +disable_existing_loggers: False +formatters: + simple: + format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s' +handlers: + console: + class: logging.StreamHandler + level: DEBUG + formatter: simple + stream: ext://sys.stdout + file: + class : logging.handlers.RotatingFileHandler + formatter: simple + filename: repairmanager.log + # roll over at 10MB + maxBytes: 10240000 + # At 
most 10 logging files + backupCount: 10 +loggers: + basic: + level: DEBUG + handlers: ['console','file'] + propagate: no +root: + level: DEBUG + handlers: ['console','file'] diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py new file mode 100755 index 000000000..7c04c4de8 --- /dev/null +++ b/src/RepairManager/main.py @@ -0,0 +1,58 @@ +import time +import sys +import yaml +import logging +import logging.config +import importlib + +import Rules + + +with open('logging.yaml', 'r') as log_file: + log_config = yaml.safe_load(log_file) + +logging.config.dictConfig(log_config) +logger = logging.getLogger('basic') +logger.setLevel(logging.DEBUG) + +logger.debug('Repair manager controller has started') + +try: + while True: + try: + #reload module + importlib.reload(Rules) + + # refresh config + with open('rule-config.yaml', 'r') as rule_file: + rule_config = yaml.safe_load(rule_file) + + rules = rule_config['rules'] + wait_time = rule_config['wait_time'] + + except Exception as e: + logger.error('Error loading modules/rule config') + logger.error(e) + + # execute all rules listed in config + for module_name in rules: + try: + module = sys.modules[module_name] + class_name = rules[module_name] + rule_class = getattr(module, class_name) + rule = rule_class() + + logger.debug('Executing ' + class_name+ ' from module ' + module_name) + + if (rule.check_status()): + rule.take_action() + + time.sleep(wait_time) + + except Exception as e: + logger.error('Error executing ' + class_name + ' from modul e' + module_name) + logger.error(e) + #TODO: send email alert? 
+ +except KeyboardInterrupt: + pass diff --git a/src/RepairManager/requirements.txt b/src/RepairManager/requirements.txt new file mode 100644 index 000000000..2bf6889ff --- /dev/null +++ b/src/RepairManager/requirements.txt @@ -0,0 +1,3 @@ +kubernetes==10.0.1 +requests==2.18.4 +PyYAML==5.1.2 diff --git a/src/RepairManager/rule-config.yaml b/src/RepairManager/rule-config.yaml new file mode 100755 index 000000000..9be74e25f --- /dev/null +++ b/src/RepairManager/rule-config.yaml @@ -0,0 +1,12 @@ +--- +# [Rule Module Name] : [Rule Class Name] +rules: + Rules.ecc_rule : ECCRule + Rules.test_rule : TestRule + +# time to sleep between rule execution +wait_time: 10 + +# prometheus ecc error query +ecc_prometheus_url: 'http://localhost:9091/prometheus/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' + diff --git a/src/RepairManager/util.py b/src/RepairManager/util.py new file mode 100644 index 000000000..8be734c31 --- /dev/null +++ b/src/RepairManager/util.py @@ -0,0 +1,6 @@ +import os + +def cordon_node(node): + output = os.system('kubectl cordon %s --dry-run' % node) + return output + From 1e2d41eea9a75afcb02f2cbf99af2a3abdfa8396 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Wed, 6 Nov 2019 23:25:49 +0000 Subject: [PATCH 2/4] remove test rule from config --- src/RepairManager/rule-config.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/RepairManager/rule-config.yaml b/src/RepairManager/rule-config.yaml index 9be74e25f..7d3e720f4 100755 --- a/src/RepairManager/rule-config.yaml +++ b/src/RepairManager/rule-config.yaml @@ -2,8 +2,7 @@ # [Rule Module Name] : [Rule Class Name] rules: Rules.ecc_rule : ECCRule - Rules.test_rule : TestRule - + # time to sleep between rule execution wait_time: 10 From ec0dd345c9b32f541bf96267a45ec8536609c6dc Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Sat, 9 Nov 2019 04:25:06 +0000 Subject: [PATCH 3/4] pr feedback --- src/RepairManager/Rules/ecc_rule.py | 14 ++++++++------ 
src/RepairManager/main.py | 25 ++++++++++++------------- src/RepairManager/rule-config.yaml | 10 ++++++---- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index c44f79bb1..ff9407184 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -45,7 +45,7 @@ def get_ECC_error_data(ecc_url): if data: ecc_metrics = data['data']['result'] - logging.info('ECC error metrics from prometheus ' + json.dumps(ecc_metrics, indent=4)) + logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) return ecc_metrics @@ -53,6 +53,9 @@ def get_ECC_error_data(ecc_url): class ECCRule(Rule): + def __init__(self): + self.ecc_hostnames = [] + def check_status(self): try: with open('rule-config.yaml', 'r') as rule_config: @@ -60,12 +63,9 @@ def check_status(self): address_map = get_node_address_info() - ecc_url = config['ecc_prometheus_url'] + ecc_url = config['prometheus_url'] + config['rules']['ecc_rule']['ecc_error_url'] ecc_metrics = get_ECC_error_data(ecc_url) - global ecc_hostnames - ecc_hostnames = [] - if (ecc_metrics): for m in ecc_metrics: offending_node_ip = m['metric']['instance'].split(':')[0] @@ -78,6 +78,8 @@ def check_status(self): logging.debug('No uncorrectable ECC metrics found.') return False + + except Exception as e: logging.exception('Error checking status for ECCRule') #TODO: send email alert, raise exception? 
@@ -88,7 +90,7 @@ def take_action(self): success = util.cordon_node(node) if (success != 0): - logging.warning('Unscheduling of nodes not successful') + logging.warning('Unscheduling of node ' + node + ' not successful') except Exception as e: logging.exception('Error taking action for ECCRule') diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py index 7c04c4de8..a4f1463fe 100755 --- a/src/RepairManager/main.py +++ b/src/RepairManager/main.py @@ -12,8 +12,7 @@ log_config = yaml.safe_load(log_file) logging.config.dictConfig(log_config) -logger = logging.getLogger('basic') -logger.setLevel(logging.DEBUG) +logger = logging.getLogger(__name__) logger.debug('Repair manager controller has started') @@ -31,27 +30,27 @@ wait_time = rule_config['wait_time'] except Exception as e: - logger.error('Error loading modules/rule config') - logger.error(e) + logger.exception('Error loading modules/rule config') # execute all rules listed in config - for module_name in rules: + for r_key in rules.keys(): try: - module = sys.modules[module_name] - class_name = rules[module_name] - rule_class = getattr(module, class_name) - rule = rule_class() + module_name = rules[r_key]['module_name'] + class_name = rules[r_key]['class_name'] - logger.debug('Executing ' + class_name+ ' from module ' + module_name) + r_module = sys.modules[module_name] + r_class = getattr(r_module, class_name) + rule = r_class() + + logger.debug('Executing ' + class_name + ' from module ' + module_name) - if (rule.check_status()): + if rule.check_status(): rule.take_action() time.sleep(wait_time) except Exception as e: - logger.error('Error executing ' + class_name + ' from modul e' + module_name) - logger.error(e) + logger.exception('Error executing ' + class_name + ' from module ' + module_name) #TODO: send email alert? 
except KeyboardInterrupt: diff --git a/src/RepairManager/rule-config.yaml b/src/RepairManager/rule-config.yaml index 7d3e720f4..f3379113b 100755 --- a/src/RepairManager/rule-config.yaml +++ b/src/RepairManager/rule-config.yaml @@ -1,11 +1,13 @@ --- -# [Rule Module Name] : [Rule Class Name] rules: - Rules.ecc_rule : ECCRule + ecc_rule: + module_name : Rules.ecc_rule + class_name : ECCRule + ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' # time to sleep between rule execution wait_time: 10 -# prometheus ecc error query -ecc_prometheus_url: 'http://localhost:9091/prometheus/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' +# prometheus +prometheus_url: 'http://localhost:9091/prometheus' From 6864fb29a9629f194ea0dec54e4cc82cc80dfd7d Mon Sep 17 00:00:00 2001 From: Deborah Sandoval Date: Fri, 8 Nov 2019 20:35:59 -0800 Subject: [PATCH 4/4] remove test_rule.py --- src/RepairManager/Rules/test_rule.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 src/RepairManager/Rules/test_rule.py diff --git a/src/RepairManager/Rules/test_rule.py b/src/RepairManager/Rules/test_rule.py deleted file mode 100644 index 30e9c05a4..000000000 --- a/src/RepairManager/Rules/test_rule.py +++ /dev/null @@ -1,11 +0,0 @@ -from Rules.rules_abc import Rule - -class TestRule(Rule): - - def check_status(self): - print("Test Rule, Checking Status...\n") - - return True - - def take_action(self): - print("Test Rule, Taking Action...\n")