diff --git a/config/config.yaml b/config/config.yaml index 7d43c725f..3aec6c37e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -2,9 +2,10 @@ kraken: kubeconfig_path: /root/.kube/config # Path to kubeconfig scenarios: # List of policies/chaos scenarios to load - scenarios/etcd.yml - - scenarios/openshift-kube-apiserver.yml + - scenarios/openshift-kube-apiserver.yml - scenarios/openshift-apiserver.yml - + node_scenarios: + - stop_kubelet_and_crash.yml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal diff --git a/kraken/invoke/command.py b/kraken/invoke/command.py index 152875ff0..46c715829 100644 --- a/kraken/invoke/command.py +++ b/kraken/invoke/command.py @@ -4,9 +4,15 @@ # Invokes a given command and returns the stdout def invoke(command): + logging.info('Try invoking ' + command) try: output = subprocess.check_output(command, shell=True, universal_newlines=True) - except Exception: - logging.error("Failed to run %s" % (command)) + except Exception as e: + logging.error("Failed to run %s: %s" % (command, e)) return output + + + # Invoke oc debug with command + def invoke_debug_helper(node_name, command): + return invoke("oc debug node/" + node_name + ' -- chroot /host ' + command) diff --git a/kraken/kubernetes/client.py b/kraken/kubernetes/client.py index 657b4bffe..ccdd51dbe 100644 --- a/kraken/kubernetes/client.py +++ b/kraken/kubernetes/client.py @@ -11,10 +11,13 @@ def initialize_clients(kubeconfig_path): # List nodes in the cluster -def list_nodes(): +def list_nodes(label_selector=None): nodes = [] try: - ret = cli.list_node(pretty=True) + if label_selector: + ret = cli.list_node(pretty=True, label_selector=label_selector) + else: + ret = cli.list_node(pretty=True) except ApiException as e: logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e) for node in ret.items: diff --git 
a/kraken/node_actions/common_node_actions.py b/kraken/node_actions/common_node_actions.py new file mode 100644 index 000000000..9432d2191 --- /dev/null +++ b/kraken/node_actions/common_node_actions.py @@ -0,0 +1,47 @@ +import time +import random +import logging +import kraken.kubernetes.client as kubecli +import kraken.invoke.command as command + + +def run_and_select_node(scenario_yaml): + for scenario in scenario_yaml['node_scenarios']: + if "node_name" in scenario.keys(): + node_names = scenario['node_name'] + else: + node_names = kubecli.list_nodes(scenario['label_selector']) + kill_count = 1 + if "instance_kill_count" in scenario.keys(): + kill_count = int(scenario['instance_kill_count']) + for i in range(kill_count): + # randomly pick node from node names + if "label_selector" in scenario.keys(): + node_name = random.choice(node_names) + else: + node_name = node_names[i] + for action in scenario['actions']: + if action == "stop_kubelet": + kubelet_action("stop", node_name) + elif action == "node_crash": + crash_node(node_name) + else: + logging.info("cloud type " + str(scenario['cloud_type'])) + timeout = int(scenario['timeout']) + logging.info("Time out " + str(timeout)) + time.sleep(timeout) + + +# Run a systemctl action (e.g. stop/start) against the kubelet on the given node +def kubelet_action(action, node_name): + stop_kubelet_response = command.invoke_debug_helper(node_name, "systemctl " + action + " kubelet") + logging.info("Response from invoke " + str(stop_kubelet_response)) + + +# Crash specific node +def crash_node(node_name): + # writing random data to /proc/sysrq-trigger crashes the kernel on the node + crash_node_response = command.invoke_debug_helper(node_name, + "dd if=/dev/urandom of=/proc/sysrq-trigger") + logging.info("Crash node " + str(crash_node_response)) diff --git a/kraken/node_actions/read_node_scenarios.py b/kraken/node_actions/read_node_scenarios.py new file mode 100644 index 000000000..354f8a45c --- /dev/null +++ b/kraken/node_actions/read_node_scenarios.py @@ -0,0 +1,13 @@ +import yaml +import logging + + 
+def read_file_return_json(node_scenario_files): + node_scenarios = [] + for scenario_file in node_scenario_files: + with open(scenario_file, 'r') as f: + scenario_config = yaml.full_load(f) + # could do checking of yaml here + node_scenarios.append(scenario_config) + logging.info('node scenarios ' + str(node_scenarios)) + return node_scenarios diff --git a/nodeScenarios/node_scenario_example.yml b/nodeScenarios/node_scenario_example.yml new file mode 100644 index 000000000..135d4c62e --- /dev/null +++ b/nodeScenarios/node_scenario_example.yml @@ -0,0 +1,16 @@ +node_scenarios: +- name: Stop kubelet + actions: + - stop_kubelet + label_selector: node-role.kubernetes.io/master + instance_kill_count: 1 + timeout: 20 + cloud_type: aws +- name: Crash the node + actions: + - node_crash + node_name: + - ip-10-0-139-241.us-east-2.compute.internal + instance_kill_count: 1 + timeout: 20 + cloud_type: aws \ No newline at end of file diff --git a/run_kraken.py b/run_kraken.py index ef860f855..f8e3ebf13 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -10,6 +10,8 @@ import kraken.kubernetes.client as kubecli import kraken.invoke.command as runcommand import pyfiglet +import kraken.node_actions.read_node_scenarios as read_node_scenarios +import kraken.node_actions.common_node_actions as node_actions # Main function @@ -23,16 +25,18 @@ def main(cfg): with open(cfg, 'r') as f: config = yaml.full_load(f) kubeconfig_path = config["kraken"]["kubeconfig_path"] - scenarios = config["kraken"]["scenarios"] - cerberus_enabled = config["cerberus"]["cerberus_enabled"] - wait_duration = config["tunings"]["wait_duration"] - iterations = config["tunings"]["iterations"] - daemon_mode = config["tunings"]['daemon_mode'] + scenarios = config["kraken"].get("scenarios", []) + node_scenario_files = config['kraken'].get("node_scenarios", []) + cerberus_enabled = config["cerberus"].get("cerberus_enabled", False) + wait_duration = config["tunings"].get("wait_duration", 60) + iterations = 
config["tunings"].get("iterations", 1) + daemon_mode = config["tunings"].get('daemon_mode', False) # Initialize clients if not os.path.isfile(kubeconfig_path): kubeconfig_path = None logging.info("Initializing client to talk to the Kubernetes cluster") + kubecli.initialize_clients(kubeconfig_path) # Cluster info @@ -41,6 +45,9 @@ def main(cfg): cluster_info = runcommand.invoke("kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'") # noqa logging.info("\n%s%s" % (cluster_version, cluster_info)) + # might be good to open and read node_scenarios here and make into json + # so that when continuous iterations don't have to keep open/reading + node_scenarios = read_node_scenarios.read_file_return_json(node_scenario_files) # Initialize the start iteration to 0 iteration = 0 @@ -58,38 +65,71 @@ def main(cfg): # Loop to run the chaos starts here while (int(iteration) < iterations): - # Inject chaos scenarios specified in the config - try: - # Loop to run the scenarios starts here - for scenario in scenarios: - logging.info("Injecting scenario: %s" % (scenario)) - runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill" - " --policy-file %s --kubeconfig %s --no-cloud" - " --inventory-kubernetes --headless" - % (scenario, kubeconfig_path)) - logging.info("Scenario: %s has been successfully injected!" % (scenario)) + if scenarios: + # Inject chaos scenarios specified in the config + try: + # Loop to run the scenarios starts here + for scenario in scenarios: + logging.info("Injecting scenario: %s" % (scenario)) + runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-" + "ssh-kill --policy-file %s --kubeconfig %s --no-cloud" + " --inventory-kubernetes --headless" + % (scenario, kubeconfig_path)) + logging.info("Scenario: %s has been successfully injected!" 
% (scenario)) + + if cerberus_enabled: + cerberus_url = config["cerberus"]["cerberus_url"] + if not cerberus_url: + logging.error("url where Cerberus publishes True/False signal " + "is not provided.") + sys.exit(1) + cerberus_status = requests.get(cerberus_url).content + cerberus_status = True if cerberus_status == b'True' else False + if not cerberus_status: + logging.error("Received a no-go signal from Cerberus, looks like" + " the cluster is unhealthy. Please check the " + "Cerberus report for more details. Test failed.") + sys.exit(1) + else: + logging.info("Received a go signal from Ceberus, the cluster is " + "healthy. Test passed.") + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) + except Exception as e: + logging.error("Failed to run scenario: %s. Encountered the following exception:" + " %s " % (scenario, e)) + elif node_scenarios: + # Inject chaos scenarios specified in the config + try: + for node_scenario in node_scenarios: + # put inner loop from node actions here? + logging.info("Injecting scenario: %s" % str(node_scenario)) + node_actions.run_and_select_node(node_scenario) + logging.info("Scenario: %s has been successfully injected!" + % (node_scenario)) + logging.info("Waiting for the specified duration: %s" + % (wait_duration)) + time.sleep(wait_duration) + if cerberus_enabled: + cerberus_url = config["cerberus"]["cerberus_url"] + if not cerberus_url: + logging.error("url where Cerberus publishes True/False signal " + "is not provided.") + sys.exit(1) + cerberus_status = requests.get(cerberus_url).content + cerberus_status = True if cerberus_status == b'True' else False + if not cerberus_status: + logging.error("Received a no-go signal from Cerberus, looks like" + " the cluster is unhealthy. Please check the " + "Cerberus report for more details. Test failed.") + sys.exit(1) + else: + logging.info("Received a go signal from Ceberus, the cluster " + "is healthy. 
Test passed.") - if cerberus_enabled: - cerberus_url = config["cerberus"]["cerberus_url"] - if not cerberus_url: - logging.error("url where Cerberus publishes True/False signal " - "is not provided.") - sys.exit(1) - cerberus_status = requests.get(cerberus_url).content - cerberus_status = True if cerberus_status == b'True' else False - if not cerberus_status: - logging.error("Received a no-go signal from Cerberus, looks like the" - " cluster is unhealthy. Please check the Cerberus report" - " for more details. Test failed.") - sys.exit(1) - else: - logging.info("Received a go signal from Ceberus, the cluster is " - "healthy. Test passed.") - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - except Exception as e: - logging.error("Failed to run scenario: %s. Encountered the following exception: %s" - % (scenario, e)) + except Exception as e: + logging.error("Failed to run scenario: %s. Encountered the following " + "exception: %s" % (node_scenario, e)) iteration += 1 else: logging.error("Cannot find a config at %s, please check" % (cfg))