From 4e4ba5aec504437e4c92891083aa6bc5c86e9ca1 Mon Sep 17 00:00:00 2001 From: Yashashree Suresh Date: Thu, 9 Jul 2020 15:28:28 +0530 Subject: [PATCH] Added cluster shut down scenario This commit adds a scenario to shut down all the nodes including the masters and restarts them after a specified duration. --- README.md | 4 +- config/config.yaml | 2 + docs/cluster_shut_down_scenarios.md | 9 ++++ run_kraken.py | 60 +++++++++++++++++++++++- scenarios/cluster_shut_down_scenario.yml | 4 ++ 5 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 docs/cluster_shut_down_scenarios.md create mode 100644 scenarios/cluster_shut_down_scenario.yml diff --git a/README.md b/README.md index 489fa3634..c1ca2afa0 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ Kraken supports pod, node and time/date based scenarios. - [Time Scenarios](docs/time_scenarios.md) +- [Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md) + ### Kraken scenario pass/fail criteria and report It's important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by: - Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks. 
@@ -41,4 +43,4 @@ We are always looking for more enhancements, fixes to make it better, any contri ### Community Key Members(slack_usernames): paigerube14, rook, mffiedler, mohit, dry923, rsevilla, ravi * [**#sig-scalability on Kubernetes Slack**](https://kubernetes.slack.com) -* [**#forum-perfscale on CoreOS Slack**](https://coreos.slack.com) +* [**#forum-perfscale on CoreOS Slack**](https://coreos.slack.com) \ No newline at end of file diff --git a/config/config.yaml b/config/config.yaml index e54453cbf..a828546e2 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -18,6 +18,8 @@ kraken: - litmus_scenarios: # List of litmus scenarios to load - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml - scenarios/node_hog_engine.yaml + - cluster_shut_down_scenario: + - scenarios/cluster_shut_down_scenario.yml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed diff --git a/docs/cluster_shut_down_scenarios.md b/docs/cluster_shut_down_scenarios.md new file mode 100644 index 000000000..5b60ee973 --- /dev/null +++ b/docs/cluster_shut_down_scenarios.md @@ -0,0 +1,9 @@ +#### Kubernetes/OpenShift cluster shut down scenario +Scenario to shut down all the nodes, including the masters, and restart them after a specified duration. The cluster shut down scenario can be injected by placing the shut_down config file under the cluster_shut_down_scenario option in the kraken config. Refer to the [cluster_shut_down_scenario](https://github.com/openshift-scale/kraken/blob/master/scenarios/cluster_shut_down_scenario.yml) config file. 
+ ``` +cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes + runs: 1 # Number of times to execute the cluster_shut_down scenario + shut_down_duration: 120 # duration in seconds to shut down the cluster + cloud_type: aws # cloud type on which Kubernetes/OpenShift runs +``` diff --git a/run_kraken.py b/run_kraken.py index 78ee6a7bf..dba6151d4 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -12,7 +12,7 @@ import kraken.invoke.command as runcommand import kraken.litmus.common_litmus as common_litmus import kraken.node_actions.common_node_functions as nodeaction -from kraken.node_actions.aws_node_scenarios import aws_node_scenarios +from kraken.node_actions.aws_node_scenarios import AWS, aws_node_scenarios from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios import kraken.time_actions.common_time_functions as time_actions @@ -277,6 +277,57 @@ def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall return litmus_namespaces +# Inject the cluster shut down scenario: stop every node, wait, then restart them +def cluster_shut_down(shut_down_config, config): + runs = shut_down_config["runs"] + shut_down_duration = shut_down_config["shut_down_duration"] + cloud_type = shut_down_config["cloud_type"] + if cloud_type != "aws": # only AWS has a cloud object here; fail before it is used + raise ValueError("Unsupported cloud_type: %s" % cloud_type) + cloud_object = AWS() + nodes = set(kubecli.list_nodes()) + node_id = {} + for node in nodes: + node_id[node] = cloud_object.get_instance_id(node) + + for _ in range(runs): + logging.info("Starting cluster_shut_down scenario injection") + for node in nodes: + cloud_object.stop_instances(node_id[node]) + logging.info("Waiting for 250s to shut down all the nodes") + time.sleep(250) + logging.info("Shutting down the cluster for the specified duration: %s" + % (shut_down_duration)) + time.sleep(shut_down_duration) + logging.info("Restarting the nodes") + restarted_nodes = set() + stopped_nodes = nodes + while 
restarted_nodes != nodes: + for node in stopped_nodes: + try: + cloud_object.start_instances(node_id[node]) + restarted_nodes.add(node) + except Exception as e: # the enclosing while loop retries this node + logging.warning("Failed to start node %s, retrying: %s" % (node, e)) + time.sleep(10) + stopped_nodes = nodes - restarted_nodes + logging.info("Waiting for 250s to allow cluster component initialization") + time.sleep(250) + logging.info("Successfully injected cluster_shut_down scenario!") + cerberus_integration(config) + logging.info("") + + +def cluster_shut_down_scenarios(scenarios_list, config): + for shut_down_config in scenarios_list: + with open(shut_down_config, 'r') as f: + shut_down_config = yaml.full_load(f) + shut_down_config = shut_down_config["cluster_shut_down_scenario"] + cluster_shut_down(shut_down_config, config) + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) + + # Main function def main(cfg): # Start kraken @@ -329,6 +380,7 @@ def main(cfg): failed_post_scenarios = [] litmus_namespaces = [] litmus_installed = False + # Loop to run the chaos starts here while (int(iteration) < iterations): # Inject chaos scenarios specified in the config @@ -350,6 +402,7 @@ def main(cfg): # Inject time skew chaos scenarios specified in the config elif scenario_type == "time_scenarios": time_scenarios(scenarios_list, config) + elif scenario_type == "litmus_scenarios": if not litmus_installed: common_litmus.install_litmus(litmus_version) @@ -359,8 +412,13 @@ def main(cfg): litmus_namespaces, litmus_uninstall) + # Inject cluster shut down scenario specified in the config + elif scenario_type == "cluster_shut_down_scenario": + cluster_shut_down_scenarios(scenarios_list, config) + iteration += 1 logging.info("") + if litmus_uninstall and litmus_installed: for namespace in litmus_namespaces: common_litmus.delete_chaos(namespace) diff --git a/scenarios/cluster_shut_down_scenario.yml b/scenarios/cluster_shut_down_scenario.yml new file mode 100644 index 000000000..11e9ca4b3 --- /dev/null +++ 
b/scenarios/cluster_shut_down_scenario.yml @@ -0,0 +1,4 @@ +cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes + runs: 1 # Number of times to execute the cluster_shut_down scenario + shut_down_duration: 120 # duration in seconds to shut down the cluster + cloud_type: aws # cloud type on which Kubernetes/OpenShift runs