Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

repair manager initial code check-in #644

Merged
merged 6 commits into from
Nov 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*.srl
*.swp

src/RepairManager/id_rsa
src/ClusterBootstrap/ssl/ca/ca.srl
**/run.sh.generated

Expand Down Expand Up @@ -56,6 +57,7 @@ src/WebUI/dotnet/WebPortal/hosting.json
**/.suo
**/storage.ide

src/RepairManager/.vs/*
/.vs/slnx.sqlite
/.vs/ProjectSettings.json
/.vs/VSWorkspaceState.json
Expand Down
16 changes: 16 additions & 0 deletions src/RepairManager/Rules/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os
import importlib

# Locate this package on disk; its directory name doubles as the package name
# used to build the dotted module paths below.
dirpath = os.path.dirname(__file__)
dirname = os.path.basename(dirpath)

# The abstract rule interface must be loaded before any concrete rule module,
# because every rule module imports it.
importlib.import_module(dirname + ".rules_abc")

# Import every remaining .py file in the package directory as a rule module.
for module in os.listdir(dirpath):
    # Skip the package initialiser, the already-imported interface, and
    # anything that is not a Python source file.
    if module in ('__init__.py', 'rules_abc.py') or not module.endswith('.py'):
        continue

    try:
        importlib.import_module(dirname + "." + module[:-3])
    except Exception as e:
        # A broken rule must not prevent the other rules from loading.
        print("Could not import " + module)
        print(e)
97 changes: 97 additions & 0 deletions src/RepairManager/Rules/ecc_rule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from Rules.rules_abc import Rule
from kubernetes import client, config
import requests
import json
import os
import time
import yaml
import util
import logging

def get_node_address_info():
    """Build a lookup from each cluster node's InternalIP to its Hostname.

    Loads the local kubeconfig, lists all nodes through the CoreV1 API and
    reads the ``InternalIP`` and ``Hostname`` entries of every node's
    ``status.addresses``.

    Returns:
        dict: ``{internal_ip: hostname}``. Nodes that report no InternalIP are
        left out, because they could only be stored under a meaningless
        ``None`` key. ``hostname`` is ``None`` for a node that reports an
        InternalIP but no Hostname.

    Raises:
        Whatever the kubernetes client raises when the kubeconfig cannot be
        loaded or the API call fails; nothing is caught here.
    """
    config.load_kube_config()
    api_instance = client.CoreV1Api()

    node_list = api_instance.list_node()

    # map InternalIP to Hostname
    address_map = {}

    if node_list:
        for node in node_list.items:
            internal_ip = None
            hostname = None

            # `addresses` is an optional field of the node status, so guard
            # against it being unset instead of iterating over None.
            for address in node.status.addresses or []:
                if address.type == 'InternalIP':
                    internal_ip = address.address
                elif address.type == 'Hostname':
                    hostname = address.address

            if internal_ip is None:
                logging.debug('Skipping node without InternalIP (hostname: %s)', hostname)
                continue

            address_map[internal_ip] = hostname

    logging.debug('node address map: %s ', address_map)

    return address_map



def get_ECC_error_data(ecc_url, timeout=30):
    """Fetch the ECC error metric series from a Prometheus query URL.

    Args:
        ecc_url: Full URL of the Prometheus query to issue with HTTP GET.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled endpoint cannot block the caller forever.

    Returns:
        The value found under ``data -> result`` in the JSON response, or an
        empty list when the response body decodes to an empty/falsy payload.

    Raises:
        requests.RequestException: the request failed or timed out.
        ValueError: the response body is not valid JSON.
        KeyError: the JSON payload lacks the ``data``/``result`` keys.
    """
    response = requests.get(ecc_url, timeout=timeout)
    data = json.loads(response.text)

    # An empty payload carries no metrics; return an empty result rather than
    # falling through to an unbound local variable.
    if not data:
        return []

    ecc_metrics = data['data']['result']
    logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics))

    return ecc_metrics



class ECCRule(Rule):
    """Rule that finds nodes with ECC error metrics and cordons them.

    ``check_status`` queries Prometheus using the URL pieces configured in
    ``rule-config.yaml`` and records the hostnames of the offending nodes in
    ``self.ecc_hostnames``; ``take_action`` then cordons each recorded node
    through ``util.cordon_node``.
    """

    def __init__(self):
        # Hostnames flagged by the most recent check_status() call.
        self.ecc_hostnames = []

    def check_status(self):
        """Check Prometheus for ECC error metrics.

        Returns:
            bool: True when the query returned at least one metric series,
            False when it returned none or when any error occurred (the error
            is logged, not raised).
        """
        # Start from a clean slate so a repeated check never re-reports nodes
        # found by an earlier call.
        self.ecc_hostnames = []

        try:
            with open('rule-config.yaml', 'r') as rule_config_file:
                rule_config = yaml.safe_load(rule_config_file)

            address_map = get_node_address_info()

            ecc_url = rule_config['prometheus_url'] + rule_config['rules']['ecc_rule']['ecc_error_url']
            ecc_metrics = get_ECC_error_data(ecc_url)

            if not ecc_metrics:
                logging.debug('No uncorrectable ECC metrics found.')
                return False

            for m in ecc_metrics:
                # The instance label is "<ip>:<port>"; only the IP is needed.
                offending_node_ip = m['metric']['instance'].split(':')[0]
                hostname = address_map.get(offending_node_ip)

                if hostname is None:
                    # Metric from an address the cluster does not know about;
                    # nothing can be cordoned for it.
                    logging.warning('No hostname known for node with ECC errors at %s', offending_node_ip)
                    continue

                # Several metric series may point at the same node; record
                # each hostname once.
                if hostname not in self.ecc_hostnames:
                    self.ecc_hostnames.append(hostname)

            logging.info('Uncorrectable ECC metrics found: %s', self.ecc_hostnames)
            return True

        except Exception:
            logging.exception('Error checking status for ECCRule')
            #TODO: send email alert, raise exception?
            return False

    def take_action(self):
        """Cordon every node recorded by the last ``check_status`` call.

        A non-zero status from ``util.cordon_node`` is logged as a warning;
        unexpected errors are logged and swallowed.
        """
        try:
            for node in self.ecc_hostnames:
                exit_status = util.cordon_node(node)

                if exit_status != 0:
                    logging.warning('Unscheduling of node ' + node + ' not successful')

        except Exception:
            logging.exception('Error taking action for ECCRule')
            #TODO: send email alert, raise exception?
11 changes: 11 additions & 0 deletions src/RepairManager/Rules/rules_abc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import abc

class Rule(abc.ABC):
    """Interface that every repair rule has to implement.

    Concrete rules subclass this and provide both methods; the class itself
    cannot be instantiated.
    """

    @abc.abstractmethod
    def check_status(self):
        """Inspect the cluster state; subclasses decide what is returned."""

    @abc.abstractmethod
    def take_action(self):
        """Carry out the rule's action; subclasses provide the behaviour."""
28 changes: 28 additions & 0 deletions src/RepairManager/logging.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
version: 1
disable_existing_loggers: False
formatters:
simple:
format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
handlers:
console:
class: logging.StreamHandler
level: DEBUG
formatter: simple
stream: ext://sys.stdout
file:
class : logging.handlers.RotatingFileHandler
formatter: simple
filename: repairmanager.log
# roll over at 10MB
maxBytes: 10240000
# At most 10 logging files
backupCount: 10
loggers:
basic:
level: DEBUG
handlers: ['console','file']
propagate: no
root:
level: DEBUG
handlers: ['console','file']
57 changes: 57 additions & 0 deletions src/RepairManager/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import time
import sys
import yaml
import logging
import logging.config
import importlib

import Rules


# Configure logging from logging.yaml before anything else is logged.
with open('logging.yaml', 'r') as log_file:
    log_config = yaml.safe_load(log_file)

logging.config.dictConfig(log_config)
logger = logging.getLogger(__name__)

logger.debug('Repair manager controller has started')

# Last successfully loaded configuration. Pre-initialised so that a failure
# on the very first load leaves the loop with usable values instead of
# unbound names.
rules = {}
wait_time = 10

try:
    while True:
        try:
            #reload module
            importlib.reload(Rules)

            # refresh config
            with open('rule-config.yaml', 'r') as rule_file:
                rule_config = yaml.safe_load(rule_file)

            # An empty `rules:` section loads as None; treat it as no rules.
            rules = rule_config['rules'] or {}
            wait_time = rule_config['wait_time']

        except Exception:
            # Keep running with the previously loaded rules and wait time.
            logger.exception('Error loading modules/rule config')

        if not rules:
            # Nothing to execute: pause so the loop does not spin at full speed.
            time.sleep(wait_time)
            continue

        # execute all rules listed in config
        for r_key, rule_entry in rules.items():
            # Pre-bind both names so the error handler can always format its
            # message, even when the config entry itself is malformed.
            module_name = class_name = '<unknown>'

            try:
                module_name = rule_entry['module_name']
                class_name = rule_entry['class_name']

                # Returns the already-loaded module, or imports it if the
                # package initialiser did not manage to.
                r_module = importlib.import_module(module_name)
                r_class = getattr(r_module, class_name)
                rule = r_class()

                logger.debug('Executing ' + class_name + ' from module ' + module_name)

                if rule.check_status():
                    rule.take_action()

            except Exception:
                logger.exception('Error executing %s from module %s', class_name, module_name)
                #TODO: send email alert?

            # Sleep after every rule, including failed ones, so that a rule
            # which keeps failing cannot turn the loop into a busy loop.
            time.sleep(wait_time)

except KeyboardInterrupt:
    pass
3 changes: 3 additions & 0 deletions src/RepairManager/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
kubernetes==10.0.1
requests==2.18.4
PyYAML==5.1.2
13 changes: 13 additions & 0 deletions src/RepairManager/rule-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
rules:
ecc_rule:
module_name : Rules.ecc_rule
class_name : ECCRule
ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0'

# time to sleep (in seconds) between rule executions
wait_time: 10

# prometheus
prometheus_url: 'http://localhost:9091/prometheus'

6 changes: 6 additions & 0 deletions src/RepairManager/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import os

def cordon_node(node, dry_run=True):
    """Run ``kubectl cordon`` for a node and report kubectl's exit status.

    The command is started from an argument list without a shell, so
    characters in ``node`` are never interpreted as shell syntax.

    Args:
        node: Name of the node to cordon.
        dry_run: When True (the default, matching the original behaviour)
            ``--dry-run`` is passed to kubectl; pass False to omit the flag.

    Returns:
        int: 0 on success, non-zero on failure. 127 is returned when the
        kubectl executable cannot be started at all.
    """
    import subprocess

    command = ['kubectl', 'cordon', str(node)]
    if dry_run:
        command.append('--dry-run')

    try:
        return subprocess.call(command)
    except OSError:
        # kubectl missing or not executable: report failure through the
        # return value, as callers only check for a non-zero status.
        return 127