From 4e4ba5aec504437e4c92891083aa6bc5c86e9ca1 Mon Sep 17 00:00:00 2001
From: Yashashree Suresh <ysuresh@redhat.com>
Date: Thu, 9 Jul 2020 15:28:28 +0530
Subject: [PATCH] Added cluster shut down scenario

This commit adds a scenario that shuts down all the nodes, including
the masters, and restarts them after a specified duration.
---
 README.md                                |  4 +-
 config/config.yaml                       |  2 +
 docs/cluster_shut_down_scenarios.md      |  9 ++++
 run_kraken.py                            | 60 +++++++++++++++++++++++-
 scenarios/cluster_shut_down_scenario.yml |  4 ++
 5 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 docs/cluster_shut_down_scenarios.md
 create mode 100644 scenarios/cluster_shut_down_scenario.yml

diff --git a/README.md b/README.md
index 489fa3634..c1ca2afa0 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,8 @@ Kraken supports pod, node and time/date based scenarios.
 
 - [Time Scenarios](docs/time_scenarios.md)
 
+- [Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md)
+
 ### Kraken scenario pass/fail criteria and report
 It's important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by:
 - Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks.
@@ -41,4 +43,4 @@ We are always looking for more enhancements, fixes to make it better, any contri
 ### Community
 Key Members(slack_usernames): paigerube14, rook, mffiedler, mohit, dry923, rsevilla, ravi
 * [**#sig-scalability on Kubernetes Slack**](https://kubernetes.slack.com)
-* [**#forum-perfscale on CoreOS Slack**](https://coreos.slack.com)
+* [**#forum-perfscale on CoreOS Slack**](https://coreos.slack.com)
\ No newline at end of file
diff --git a/config/config.yaml b/config/config.yaml
index e54453cbf..a828546e2 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -18,6 +18,8 @@ kraken:
         -   litmus_scenarios:                              # List of litmus scenarios to load
             - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
               - scenarios/node_hog_engine.yaml
+        -   cluster_shut_down_scenarios:
+            -   scenarios/cluster_shut_down_scenario.yml
 
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed
diff --git a/docs/cluster_shut_down_scenarios.md b/docs/cluster_shut_down_scenarios.md
new file mode 100644
index 000000000..5b60ee973
--- /dev/null
+++ b/docs/cluster_shut_down_scenarios.md
@@ -0,0 +1,9 @@
+#### Kubernetes/OpenShift cluster shut down scenario
+Scenario to shut down all the nodes, including the masters, and restart them after the specified duration. The cluster shut down scenario can be injected by placing the scenario config file under the cluster_shut_down_scenarios option in the kraken config. Refer to the [cluster_shut_down_scenario](https://github.com/openshift-scale/kraken/blob/master/scenarios/cluster_shut_down_scenario.yml) config file.
+
+```
+cluster_shut_down_scenario:                          # Scenario to stop all the nodes for specified duration and restart the nodes
+  runs: 1                                            # Number of times to execute the cluster_shut_down scenario
+  shut_down_duration: 120                            # duration in seconds to shut down the cluster
+  cloud_type: aws                                    # cloud type on which Kubernetes/OpenShift runs
+```
diff --git a/run_kraken.py b/run_kraken.py
index 78ee6a7bf..dba6151d4 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -12,7 +12,7 @@
 import kraken.invoke.command as runcommand
 import kraken.litmus.common_litmus as common_litmus
 import kraken.node_actions.common_node_functions as nodeaction
-from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
+from kraken.node_actions.aws_node_scenarios import AWS, aws_node_scenarios
 from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
 from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
 import kraken.time_actions.common_time_functions as time_actions
@@ -277,6 +277,71 @@ def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall
     return litmus_namespaces
 
 
+# Inject the cluster shut down scenario
+def cluster_shut_down(shut_down_config, config):
+    """Stop every node of the cluster, keep it down, then start the nodes again.
+
+    Args:
+        shut_down_config: mapping read from the scenario file; the keys
+            "runs", "shut_down_duration" (seconds) and "cloud_type" are used.
+        config: kraken config, passed to cerberus_integration after each run.
+
+    Raises:
+        ValueError: if cloud_type is not "aws", the only cloud handled here.
+    """
+    runs = shut_down_config["runs"]
+    shut_down_duration = shut_down_config["shut_down_duration"]
+    cloud_type = shut_down_config["cloud_type"]
+    # Fail with a clear message instead of hitting an unbound cloud_object below
+    if cloud_type != "aws":
+        raise ValueError("Cloud type %s is not supported for the "
+                         "cluster_shut_down scenario" % (cloud_type))
+    cloud_object = AWS()
+
+    nodes = set(kubecli.list_nodes())
+    # Look up each node's instance id once, before any node is stopped
+    node_id = {node: cloud_object.get_instance_id(node) for node in nodes}
+
+    for _ in range(runs):
+        logging.info("Starting cluster_shut_down scenario injection")
+        for node in nodes:
+            cloud_object.stop_instances(node_id[node])
+        logging.info("Waiting for 250s to shut down all the nodes")
+        time.sleep(250)
+        logging.info("Shutting down the cluster for the specified duration: %s"
+                     % (shut_down_duration))
+        time.sleep(shut_down_duration)
+        logging.info("Restarting the nodes")
+        stopped_nodes = set(nodes)
+        # Keep retrying until a start request has succeeded for every node
+        while stopped_nodes:
+            for node in list(stopped_nodes):
+                try:
+                    cloud_object.start_instances(node_id[node])
+                except Exception as e:
+                    # Best effort: log the failure and retry on the next pass
+                    logging.info("Failed to start node %s, will retry: %s"
+                                 % (node, e))
+                    time.sleep(10)
+                else:
+                    stopped_nodes.discard(node)
+        logging.info("Waiting for 250s to allow cluster component initilization")
+        time.sleep(250)
+        logging.info("Successfully injected cluster_shut_down scenario!")
+        cerberus_integration(config)
+        logging.info("")
+
+
+def cluster_shut_down_scenarios(scenarios_list, config):
+    """Run the cluster shut down scenario from each listed config file."""
+    for scenario_path in scenarios_list:
+        with open(scenario_path, 'r') as f:  # closed before the long injection
+            scenario = yaml.full_load(f)["cluster_shut_down_scenario"]
+        cluster_shut_down(scenario, config)
+        logging.info("Waiting for the specified duration: %s" % (wait_duration))
+        time.sleep(wait_duration)  # NOTE(review): confirm wait_duration is a module global
+
+
 # Main function
 def main(cfg):
     # Start kraken
@@ -329,6 +380,7 @@ def main(cfg):
         failed_post_scenarios = []
         litmus_namespaces = []
         litmus_installed = False
+        
         # Loop to run the chaos starts here
         while (int(iteration) < iterations):
             # Inject chaos scenarios specified in the config
@@ -350,6 +402,7 @@ def main(cfg):
                         # Inject time skew chaos scenarios specified in the config
                         elif scenario_type == "time_scenarios":
                             time_scenarios(scenarios_list, config)
+
                         elif scenario_type == "litmus_scenarios":
                             if not litmus_installed:
                                 common_litmus.install_litmus(litmus_version)
@@ -359,8 +412,13 @@ def main(cfg):
                                                                  litmus_namespaces,
                                                                  litmus_uninstall)
 
+                        # Inject cluster shut down scenario specified in the config
+                        elif scenario_type == "cluster_shut_down_scenarios":
+                            cluster_shut_down_scenarios(scenarios_list, config)
+
             iteration += 1
             logging.info("")
+
         if litmus_uninstall and litmus_installed:
             for namespace in litmus_namespaces:
                 common_litmus.delete_chaos(namespace)
diff --git a/scenarios/cluster_shut_down_scenario.yml b/scenarios/cluster_shut_down_scenario.yml
new file mode 100644
index 000000000..11e9ca4b3
--- /dev/null
+++ b/scenarios/cluster_shut_down_scenario.yml
@@ -0,0 +1,4 @@
+cluster_shut_down_scenario:                          # Scenario to stop all the nodes for specified duration and restart the nodes
+  runs: 1                                            # Number of times to execute the cluster_shut_down scenario
+  shut_down_duration: 120                            # duration in seconds to shut down the cluster
+  cloud_type: aws                                    # cloud type on which Kubernetes/OpenShift runs