Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ kraken:
kubeconfig_path: /root/.kube/config # Path to kubeconfig
scenarios: # List of policies/chaos scenarios to load
- scenarios/etcd.yml
- scenarios/openshift-kube-apiserver.yml
- scenarios/openshift-kube-apiserver.yml
- scenarios/openshift-apiserver.yml

node_scenarios:
- stop_kubelet_and_crash.yml
cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
Expand Down
10 changes: 8 additions & 2 deletions kraken/invoke/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,15 @@

# Invokes a given command and returns the stdout
def invoke(command):
logging.info('Try invoking ' + command)
try:
output = subprocess.check_output(command, shell=True,
universal_newlines=True)
except Exception:
logging.error("Failed to run %s" % (command))
except Exception as e:
logging.error("Failed to run %s" % (e))
return output


# Invoke oc debug with command
def invoke_debug_helper(node_name, command):
return invoke("oc debug node/" + node_name + ' -- chroot /host ' + command)
7 changes: 5 additions & 2 deletions kraken/kubernetes/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ def initialize_clients(kubeconfig_path):


# List nodes in the cluster
def list_nodes():
def list_nodes(label_selector=None):
nodes = []
try:
ret = cli.list_node(pretty=True)
if label_selector:
ret = cli.list_node(pretty=True, label_selector=label_selector)
else:
ret = cli.list_node(pretty=True)
except ApiException as e:
logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
for node in ret.items:
Expand Down
47 changes: 47 additions & 0 deletions kraken/node_actions/common_node_actions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import time
import random
import logging
import kraken.kubernetes.client as kubecli
import kraken.invoke.command as command


def run_and_select_node(scenario_yaml):
for scenario in scenario_yaml['node_scenarios']:
if "node_name" in scenario.keys():
node_names = scenario['node_name']
else:
node_names = kubecli.list_nodes(scenario['label_selector'])
kill_count = 1
if "instance_kill_count" in scenario.keys():
kill_count = int(scenario['instance_kill_count'])
for i in range(kill_count):
# randomly pick node from node names
if "label_selector" in scenario.keys():
node_name = random.choice(node_names)
else:
node_name = node_names[i]
for action in scenario['actions']:
if action == "stop_kubelet":
kubelet_action("stop", node_name)
elif action == "node_crash":
crash_node(node_name)
else:
logging.info("cloud type " + str(scenario['cloud_type']))
timeout = int(scenario['timeout'])
logging.info("Time out" + str(timeout))
time.sleep(timeout)


# Stop the kubelet on one of the nodes
def kubelet_action(action, node_name):
stop_kubelet_response = command.invoke_debug_helper(node_name, "systemctl " + action + ""
" kubelet")
logging.info("Response from invoke " + str(stop_kubelet_response))


# Crash specific node
def crash_node(node_name):
# found for fork bomb -> :(){:|:};:
crash_node_response = command.invoke_debug_helper(node_name,
"dd if=/dev/urandom of=/proc/sysrq-trigger")
logging.info("Crash node " + str(crash_node_response))
13 changes: 13 additions & 0 deletions kraken/node_actions/read_node_scenarios.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import yaml
import logging


def read_file_return_json(node_scenario_files):
node_scenarios = []
for scenario_file in node_scenario_files:
with open(scenario_file, 'r') as f:
scenario_config = yaml.full_load(f)
# could do checking of yaml here
node_scenarios.append(scenario_config)
logging.info('node scenarios ' + str(node_scenarios))
return node_scenarios
16 changes: 16 additions & 0 deletions nodeScenarios/node_scenario_example.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
node_scenarios:
- name: Stop kubelet
actions:
- stop_kubelet
label_selector: node-role.kubernetes.io/master
instance_kill_count: 1
timeout: 20
cloud_type: aws
- name: Fork bomb the node
actions:
- node_crash
node_name:
- ip-10-0-139-241.us-east-2.compute.internal
instance_kill_count: 2
timeout: 20
cloud_type: aws
112 changes: 76 additions & 36 deletions run_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import kraken.kubernetes.client as kubecli
import kraken.invoke.command as runcommand
import pyfiglet
import kraken.node_actions.read_node_scenarios as read_node_scenarios
import kraken.node_actions.common_node_actions as node_actions


# Main function
Expand All @@ -23,16 +25,18 @@ def main(cfg):
with open(cfg, 'r') as f:
config = yaml.full_load(f)
kubeconfig_path = config["kraken"]["kubeconfig_path"]
scenarios = config["kraken"]["scenarios"]
cerberus_enabled = config["cerberus"]["cerberus_enabled"]
wait_duration = config["tunings"]["wait_duration"]
iterations = config["tunings"]["iterations"]
daemon_mode = config["tunings"]['daemon_mode']
scenarios = config["kraken"].get("scenarios", [])
node_scenario_files = config['kraken'].get("node_scenarios", [])
cerberus_enabled = config["cerberus"].get("cerberus_enabled", False)
wait_duration = config["tunings"].get("wait_duration", 60)
iterations = config["tunings"].get("iterations", 1)
daemon_mode = config["tunings"].get('daemon_mode', False)

# Initialize clients
if not os.path.isfile(kubeconfig_path):
kubeconfig_path = None
logging.info("Initializing client to talk to the Kubernetes cluster")

kubecli.initialize_clients(kubeconfig_path)

# Cluster info
Expand All @@ -41,6 +45,9 @@ def main(cfg):
cluster_info = runcommand.invoke("kubectl cluster-info | awk 'NR==1' | sed -r "
"'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'") # noqa
logging.info("\n%s%s" % (cluster_version, cluster_info))
# might be good to open and read node_scenarios here and make into json
# so that when continuous iterations don't have to keep open/reading
node_scenarios = read_node_scenarios.read_file_return_json(node_scenario_files)

# Initialize the start iteration to 0
iteration = 0
Expand All @@ -58,38 +65,71 @@ def main(cfg):

# Loop to run the chaos starts here
while (int(iteration) < iterations):
# Inject chaos scenarios specified in the config
try:
# Loop to run the scenarios starts here
for scenario in scenarios:
logging.info("Injecting scenario: %s" % (scenario))
runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
" --policy-file %s --kubeconfig %s --no-cloud"
" --inventory-kubernetes --headless"
% (scenario, kubeconfig_path))
logging.info("Scenario: %s has been successfully injected!" % (scenario))
if scenarios:
# Inject chaos scenarios specified in the config
try:
# Loop to run the scenarios starts here
for scenario in scenarios:
logging.info("Injecting scenario: %s" % (scenario))
runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-"
"ssh-kill --policy-file %s --kubeconfig %s --no-cloud"
" --inventory-kubernetes --headless"
% (scenario, kubeconfig_path))
logging.info("Scenario: %s has been successfully injected!" % (scenario))

if cerberus_enabled:
cerberus_url = config["cerberus"]["cerberus_url"]
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal "
"is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url).content
cerberus_status = True if cerberus_status == b'True' else False
if not cerberus_status:
logging.error("Received a no-go signal from Cerberus, looks like"
" the cluster is unhealthy. Please check the "
"Cerberus report for more details. Test failed.")
sys.exit(1)
else:
logging.info("Received a go signal from Ceberus, the cluster is "
"healthy. Test passed.")
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
except Exception as e:
logging.error("Failed to run scenario: %s. Encountered the following exception:"
" %s " % (scenario, e))
elif node_scenarios:
# Inject chaos scenarios specified in the config
try:
for node_scenario in node_scenarios:
# put inner loop from node actions here?
logging.info("Injecting scenario: %s" % str(node_scenario))
node_actions.run_and_select_node(node_scenario)
logging.info("Scenario: %s has been successfully injected!"
% (node_scenario))
logging.info("Waiting for the specified duration: %s"
% (wait_duration))
time.sleep(wait_duration)
if cerberus_enabled:
cerberus_url = config["cerberus"]["cerberus_url"]
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal "
"is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url).content
cerberus_status = True if cerberus_status == b'True' else False
if not cerberus_status:
logging.error("Received a no-go signal from Cerberus, looks like"
" the cluster is unhealthy. Please check the "
"Cerberus report for more details. Test failed.")
sys.exit(1)
else:
logging.info("Received a go signal from Ceberus, the cluster "
"is healthy. Test passed.")

if cerberus_enabled:
cerberus_url = config["cerberus"]["cerberus_url"]
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal "
"is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url).content
cerberus_status = True if cerberus_status == b'True' else False
if not cerberus_status:
logging.error("Received a no-go signal from Cerberus, looks like the"
" cluster is unhealthy. Please check the Cerberus report"
" for more details. Test failed.")
sys.exit(1)
else:
logging.info("Received a go signal from Ceberus, the cluster is "
"healthy. Test passed.")
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
except Exception as e:
logging.error("Failed to run scenario: %s. Encountered the following exception: %s"
% (scenario, e))
except Exception as e:
logging.error("Failed to run scenario: %s. Encountered the following "
"exception: %s" % (node_scenario, e))
iteration += 1
else:
logging.error("Cannot find a config at %s, please check" % (cfg))
Expand Down