krkn-chaos · mffiedler · Nov 17, 2020 · Oct 27, 2020
diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md
@@ -13,11 +13,22 @@ Following node chaos scenarios are supported:
 
 **NOTE**: If the node doesn't recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
 
-**NOTE**: node_start_scenario, node_stop_scenario, node_stop_start_scenario, node_termination_scenario, node_reboot_scenario and stop_start_kubelet_scenario are supported only on AWS as of now.
+**NOTE**: node_start_scenario, node_stop_scenario, node_stop_start_scenario, node_termination_scenario and node_reboot_scenario are supported only on AWS and GCP as of now.
 
-**NOTE**: AWS is the only cloud platform supported as of today but we are looking into adding more. Make sure [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) is installed and properly [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) using an AWS account
+#### AWS
 
-**NOTE**: The `stop_start_kubelet_scenario` and `node_crash_scenario` scenarios are supported as they are independent of the cloud platform
+**NOTE**: For clusters on AWS, make sure the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) is installed and properly [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) using an AWS account.
+
+#### GCP
+**NOTE**: For clusters on GCP, make sure the [GCP CLI](https://cloud.google.com/sdk/docs/install#linux) is installed.
+
+A Google service account is required to authenticate to GCP for node actions. See [here](https://cloud.google.com/docs/authentication/getting-started) for how to create a service account.
+
+**NOTE**: A user with 'resourcemanager.projects.setIamPolicy' permission is required to grant project-level permissions to the service account.
+
+After creating the service account, make its JSON key file available to Kraken by setting the following environment variable: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
+
+**NOTE**: The `stop_start_kubelet_scenario` and `node_crash_scenario` scenarios are supported as they are independent of the cloud platform.
 
 
 Node scenarios can be injected by placing the node scenarios config files under node_scenarios option in the kraken config. Refer to [node_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/node_scenarios_example.yml) config file.

diff --git a/kraken/node_actions/gcp_node_scenarios.py b/kraken/node_actions/gcp_node_scenarios.py
@@ -0,0 +1,182 @@
+import sys
+import time
+import logging
+import kraken.kubernetes.client as kubecli
+import kraken.node_actions.common_node_functions as nodeaction
+from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
+from googleapiclient import discovery
+from oauth2client.client import GoogleCredentials
+import kraken.invoke.command as runcommand
+
+
+class GCP:
+    """Wrapper around the Compute Engine v1 API client used to act on the instances backing cluster nodes."""
+    def __init__(self):
+        self.project = runcommand.invoke('gcloud config get-value project').split('\n')[0].strip()
+        logging.info("project " + str(self.project) + "!")
+        credentials = GoogleCredentials.get_application_default()  # uses GOOGLE_APPLICATION_CREDENTIALS if set
+        self.client = discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False)
+
+    # Get the (instance name, zone name) of the first instance whose name is contained in node; None if no match
+    def get_instance_id(self, node):
+        zone_request = self.client.zones().list(project=self.project)
+        while zone_request is not None:
+            zone_response = zone_request.execute()
+            for zone in zone_response.get('items', []):
+                instances_request = self.client.instances().list(project=self.project, zone=zone['name'])
+                while instances_request is not None:
+                    instance_response = instances_request.execute()
+                    for instance in instance_response.get('items', []):
+                        if instance['name'] in node:
+                            return instance['name'], zone['name']
+                    # Ask the instances collection for the next page; None once the last page is consumed
+                    instances_request = self.client.instances().list_next(
+                        previous_request=instances_request, previous_response=instance_response)
+            zone_request = self.client.zones().list_next(previous_request=zone_request, previous_response=zone_response)
+        logging.info('no instances ')
+
+    # Start the node instance
+    def start_instances(self, zone, instance_id):
+        self.client.instances().start(project=self.project, zone=zone, instance=instance_id).execute()
+
+    # Stop the node instance
+    def stop_instances(self, zone, instance_id):
+        self.client.instances().stop(project=self.project, zone=zone, instance=instance_id).execute()
+
+    # Suspend the node instance
+    def suspend_instances(self, zone, instance_id):
+        self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id).execute()
+
+    # Terminate the node instance (issues a delete request for it)
+    def terminate_instances(self, zone, instance_id):
+        self.client.instances().delete(project=self.project, zone=zone, instance=instance_id).execute()
+
+    # Reboot the node instance (issues a reset request) and log the API response
+    def reboot_instances(self, zone, instance_id):
+        response = self.client.instances().reset(project=self.project, zone=zone, instance=instance_id).execute()
+        logging.info('response reboot ' + str(response))
+
+    # Poll the instance every 5 seconds until it reports expected_status; True on match, False once timeout elapses
+    def get_instance_status(self, zone, instance_id, expected_status, timeout):
+        # statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING, and TERMINATED.
+        i = 0
+        sleeper = 5
+        while i <= timeout:
+            inst_status = self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute()
+            logging.info("Status of vm " + str(inst_status['status']))
+            if inst_status['status'] == expected_status:
+                return True
+            time.sleep(sleeper)
+            i += sleeper
+        logging.error("Status of %s was not %s in %s seconds", instance_id, expected_status, timeout)
+        return False
+
+    # Wait until the node instance is suspended; True on success, False on timeout
+    def wait_until_suspended(self, zone, instance_id, timeout):
+        return self.get_instance_status(zone, instance_id, 'SUSPENDED', timeout)
+
+    # Wait until the node instance is running; True on success, False on timeout
+    def wait_until_running(self, zone, instance_id, timeout):
+        return self.get_instance_status(zone, instance_id, 'RUNNING', timeout)
+
+    # Wait until the node instance is stopped (polls for the TERMINATED status); True on success, False on timeout
+    def wait_until_stopped(self, zone, instance_id, timeout):
+        return self.get_instance_status(zone, instance_id, 'TERMINATED', timeout)
+
+    # Wait until the instance is terminated: True once its lookup raises (assumed gone), False on timeout
+    def wait_until_terminated(self, zone, instance_id, timeout):
+        try:
+            i = 0
+            sleeper = 5
+            while i <= timeout:
+                inst_status = self.client.instances().get(
+                    project=self.project, zone=zone, instance=instance_id).execute()
+                logging.info("Status of vm " + str(inst_status['status']))
+                time.sleep(sleeper)
+                i += sleeper
+        except Exception as e:
+            logging.info("here " + str(e))
+            return True
+        logging.error("Instance %s was not terminated in %s seconds", instance_id, timeout)
+        return False
+
+
+class gcp_node_scenarios(abstract_node_scenarios):
+    """Node chaos scenarios for GCP-hosted clusters; any failure is logged and ends the run via sys.exit(1)."""
+
+    def __init__(self):
+        self.gcp = GCP()
+
+    # Start the instance backing the node, then wait for the instance to run and the node to report ready
+    def node_start_scenario(self, instance_kill_count, node, timeout):
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_start_scenario injection")
+                vm_name, vm_zone = self.gcp.get_instance_id(node)
+                logging.info("Starting the node %s with instance ID: %s ", node, vm_name)
+                self.gcp.start_instances(vm_zone, vm_name)
+                self.gcp.wait_until_running(vm_zone, vm_name, timeout)
+                nodeaction.wait_for_ready_status(node, timeout)
+                logging.info("Node with instance ID: %s is in running state", vm_name)
+                logging.info("node_start_scenario has been successfully injected!")
+            except Exception as err:
+                logging.error("Failed to start node instance. Encountered following exception: %s. Test Failed", err)
+                logging.error("node_start_scenario injection failed!")
+                sys.exit(1)
+
+    # Stop the instance backing the node, wait until it is stopped, then wait for the unknown node status
+    def node_stop_scenario(self, instance_kill_count, node, timeout):
+        logging.info('stop scenario')
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_stop_scenario injection")
+                vm_name, vm_zone = self.gcp.get_instance_id(node)
+                logging.info("Stopping the node %s with instance ID: %s ", node, vm_name)
+                self.gcp.stop_instances(vm_zone, vm_name)
+                self.gcp.wait_until_stopped(vm_zone, vm_name, timeout)
+                logging.info("Node with instance ID: %s is in stopped state", vm_name)
+                nodeaction.wait_for_unknown_status(node, timeout)
+            except Exception as err:
+                logging.error("Failed to stop node instance. Encountered following exception: %s. Test Failed", err)
+                logging.error("node_stop_scenario injection failed!")
+                sys.exit(1)
+
+    # Terminate the instance backing the node and verify the node disappears from the node list
+    def node_termination_scenario(self, instance_kill_count, node, timeout):
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_termination_scenario injection")
+                vm_name, vm_zone = self.gcp.get_instance_id(node)
+                logging.info("Terminating the node %s with instance ID: %s ", node, vm_name)
+                self.gcp.terminate_instances(vm_zone, vm_name)
+                self.gcp.wait_until_terminated(vm_zone, vm_name, timeout)
+                # Re-check the node list once per second, for at most `timeout` checks
+                for _attempt in range(timeout):
+                    if node not in kubecli.list_nodes():
+                        break
+                    time.sleep(1)
+                if node in kubecli.list_nodes():
+                    raise Exception("Node could not be terminated")
+                logging.info("Node with instance ID: %s has been terminated", vm_name)
+                logging.info("node_termination_scenario has been successfuly injected!")
+            except Exception as err:
+                logging.error("Failed to terminate node instance. Encountered following "
+                              "exception: %s. Test Failed", err)
+                logging.error("node_termination_scenario injection failed!")
+                sys.exit(1)
+
+    # Reboot the instance backing the node, then wait for the node to report ready
+    def node_reboot_scenario(self, instance_kill_count, node, timeout):
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_reboot_scenario injection")
+                vm_name, vm_zone = self.gcp.get_instance_id(node)
+                logging.info("Rebooting the node %s with instance ID: %s ", node, vm_name)
+                self.gcp.reboot_instances(vm_zone, vm_name)
+                nodeaction.wait_for_ready_status(node, timeout)
+                logging.info("Node with instance ID: %s has been rebooted", vm_name)
+                logging.info("node_reboot_scenario has been successfuly injected!")
+            except Exception as err:
+                logging.error("Failed to reboot node instance. Encountered following exception: %s. Test Failed", err)
+                logging.error("node_reboot_scenario injection failed!")
+                sys.exit(1)
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ PyYAML
 git+https://github.com/powerfulseal/powerfulseal.git
 requests
 boto3
+google-api-python-client
diff --git a/run_kraken.py b/run_kraken.py
@@ -12,13 +12,16 @@
 import kraken.invoke.command as runcommand
 import kraken.node_actions.common_node_functions as nodeaction
 from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
+from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
 import kraken.time_actions.common_time_functions as time_actions
 
 
 # Get the node scenarios object of specfied cloud type
 def get_node_scenario_object(node_scenario):
     if node_scenario['cloud_type'] == 'aws':
         return aws_node_scenarios()
+    elif node_scenario['cloud_type'] == 'gcp':  # any other cloud_type falls through and returns None
+        return gcp_node_scenarios()
 
 
 # Inject the specified node scenario
@@ -28,10 +31,8 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
     node_name = node_scenario.get("node_name", "")
     label_selector = node_scenario.get("label_selector", "")
     timeout = node_scenario.get("timeout", 120)
-
     # Get the node to apply the scenario
     node = nodeaction.get_node(node_name, label_selector)
-
     if action == "node_start_scenario":
         node_scenario_object.node_start_scenario(instance_kill_count, node, timeout)
     elif action == "node_stop_scenario":