Skip to content
This repository was archived by the owner on Feb 5, 2020. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
306 changes: 232 additions & 74 deletions tests/rspec/lib/aws_cluster.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# frozen_string_literal: true

require 'cluster'
require 'aws_region'
require 'json'
require 'jenkins'
Expand All @@ -11,9 +10,20 @@
require 'tfstate_file'
require 'fileutils'
require 'with_retries'
require 'cluster_support'
require 'kubectl_helpers'
require 'name_generator'
require 'password_generator'
require 'securerandom'
require 'ssh'
require 'tfvars_file'
require 'config_file'
require 'timeout'
require 'with_retries'
require 'open3'

# AWSCluster represents a k8s cluster on AWS cloud provider
class AwsCluster < Cluster
class AwsCluster
TIMEOUT_IN_SECONDS = (30 * 60).freeze # 30 minutes

attr_reader :config_file, :kubeconfig, :manifest_path, :build_path,
Expand Down Expand Up @@ -52,8 +62,79 @@ def initialize(config_file)
@tfstate['topology'] = TFStateFile.new(@build_path, 'topology.tfstate')
end

# Bring the cluster up end-to-end: provision the infrastructure with
# terraform, then block until Kubernetes reports the cluster usable.
def start
  apply           # provision / converge the AWS resources
  wait_til_ready  # poll until the API server and services respond
end

# Run `tectonic init` against the checked-in config. After init the rendered
# config inside the build folder is the source of truth, so @config_file is
# re-pointed at it.
def init
  run_tectonic_cli(
    env_variables.merge('TF_INIT_OPTIONS' => '-no-color'),
    'init', '--config=config.yaml'
  )
  @config_file = ConfigFile.new(File.expand_path("#{@name}/config.yaml"))
end

# Roll out configuration changes to an existing cluster by re-running the
# full start sequence (terraform apply + readiness wait).
def update_cluster
  start
end

# Tear the cluster down.
#
# Ephemeral SSH key pairs created by the test harness (named "rspec-...")
# are deleted from AWS first. If TECTONIC_TESTS_DONT_CLEAN_UP is set, the
# terraform destroy is skipped so the cluster can be inspected manually,
# and the console credentials are printed for convenience.
def stop
  ssh_key = ENV['TF_VAR_tectonic_aws_ssh_key']
  # .to_s guards against a missing env var: calling #include? on nil raised
  # NoMethodError before any cleanup could run.
  if ssh_key.to_s.include?('rspec-')
    AwsSupport.delete_aws_key_pairs(ssh_key, @aws_region, @role_credentials)
  end

  if ENV.key?('TECTONIC_TESTS_DONT_CLEAN_UP')
    puts "*** Cleanup inhibiting flag set. Stopping here. ***\n"
    puts '*** Your email/password to use in the tectonic console is:'\
         "#{@tectonic_admin_email} / #{@tectonic_admin_password} ***\n"
    return
  end
  destroy
end

# Validate every environment prerequisite for an AWS cluster run, raising a
# descriptive error on the first one that is missing.
#
# @raise [RuntimeError] when credentials, SSH key, region, or the Tectonic
#   license/pull-secret environment variables are absent
def check_prerequisites
  raise 'AWS credentials not defined' unless credentials_defined?
  raise 'TF_VAR_tectonic_aws_ssh_key is not defined' unless ssh_key_defined?
  raise 'TF_VAR_tectonic_aws_region is not defined' unless region_defined?

  return if license_and_pull_secret_defined?
  # Trailing space added: the two fragments previously concatenated to
  # "...as environmentvariables." in the raised message.
  raise 'Tectonic license and pull secret are not defined as environment '\
        'variables.'
end

# True when the target AWS region is configured in the environment.
def region_defined?
  EnvVar.set?(['TF_VAR_tectonic_aws_region'])
end

# AWS credentials may be supplied in any one of three shapes: a static
# key pair, a named profile, or a key pair plus session token (STS).
# Returns true when at least one complete shape is present.
def credentials_defined?
  accepted_shapes = [
    %w[AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY],
    %w[AWS_PROFILE],
    %w[AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN]
  ]
  accepted_shapes.any? { |names| EnvVar.set?(names) }
end

# True when the SSH key pair name for the cluster nodes is configured.
def ssh_key_defined?
  EnvVar.set?(['TF_VAR_tectonic_aws_ssh_key'])
end

# After a failed `terraform destroy`, fall back to grafiti to sweep the
# remaining tagged AWS resources in the configured region.
def recover_from_failed_destroy
  region = ENV['TF_VAR_tectonic_aws_region']
  Grafiti.new(@build_path, region).clean
end

def env_variables
variables = super
variables = {}
variables['CLUSTER'] = @name
variables['TF_VAR_tectonic_cluster_name'] = @name
variables['TF_VAR_tectonic_admin_email'] = @tectonic_admin_email
variables['TF_VAR_tectonic_admin_password'] = @tectonic_admin_password
variables['PLATFORM'] = 'aws'
variables['TF_VAR_tectonic_cluster_name'] = @config_file.cluster_name
variables['CLUSTER'] = @config_file.cluster_name
Expand All @@ -67,12 +148,67 @@ def env_variables
variables
end

def stop
if ENV['TF_VAR_tectonic_aws_ssh_key'].include?('rspec-')
AwsSupport.delete_aws_key_pairs(ENV['TF_VAR_tectonic_aws_ssh_key'], @aws_region, @role_credentials)
# List the data keys of a Kubernetes secret (one key per line from the
# go-template output), e.g. the file names stored in a TLS secret.
#
# @param namespace [String] namespace holding the secret
# @param secret [String] secret name
# @return [Array<String>] the secret's data keys
def secret_files(namespace, secret)
  go_template = "'--template={{range $key, $value := .data}}{{$key}}\n{{end}}'"
  cmd = "get secret -n #{namespace} #{secret} -o go-template #{go_template}"
  KubeCTL.run(@kubeconfig, cmd).split("\n")
end

# Map each kube-apiserver pod name to the ExternalIP of the master node it
# is scheduled on.
#
# @return [Hash{String => String}] pod name => node ExternalIP
def api_ip_addresses
  node_cmd =
    'get node -l=node-role.kubernetes.io/master '\
    '-o jsonpath=\'{range .items[*]}'\
    '{@.metadata.name}{"\t"}{@.status.addresses[?(@.type=="ExternalIP")].address}'\
    '{"\n"}{end}\''
  pod_cmd =
    'get pod -n kube-system -l k8s-app=kube-apiserver '\
    '-o \'jsonpath={range .items[*]}'\
    '{@.metadata.name}{"\t"}{@.spec.nodeName}'\
    '{"\n"}{end}\''

  # node name => ExternalIP
  ip_by_node = KubeCTL.run(@kubeconfig, node_cmd)
                      .split("\n")
                      .map { |line| line.split("\t") }
                      .to_h

  KubeCTL.run(@kubeconfig, pod_cmd).split("\n").map do |line|
    pod_name, node_name = line.split("\t")
    [pod_name, ip_by_node[node_name]]
  end.to_h
end

# Collect debugging artifacts from a (possibly broken) cluster: machine
# boot console logs, Kubernetes events, docker logs, and systemd service
# logs from master, worker, and etcd nodes.
#
# @param events [Boolean] when true, also save the Kubernetes event stream
def forensic(events = true)
  outputs_console_logs = machine_boot_console_logs
  outputs_console_logs.each do |ip, log|
    puts "saving boot logs from master-#{ip}"
    save_to_file(@name, 'console_machine', ip, 'console_machine', log)
  end

  save_kubernetes_events(@kubeconfig, @name) if events

  master_ip_addresses.each do |master_ip|
    save_docker_logs(master_ip, @name)

    ['bootkube', 'tectonic', 'kubelet', 'k8s-node-bootstrap'].each do |service|
      print_service_logs(master_ip, service, @name)
    end
  end

  # NOTE(review): `super` assumes a superclass implementing #forensic; the
  # class declaration in this change drops `< Cluster`, so verify this call
  # is still valid (it may be a leftover removed line in the diff).
  super
  # Worker/etcd logs are fetched via the master IP, presumably as an SSH
  # jump host — TODO confirm against save_docker_logs/print_service_logs.
  worker_ip_addresses.each do |worker_ip|
    save_docker_logs(worker_ip, @name, master_ip_address)

    ['kubelet'].each do |service|
      print_service_logs(worker_ip, service, @name, master_ip_address)
    end
  end

  etcd_ip_addresses.each do |etcd_ip|
    ['etcd-member'].each do |service|
      print_service_logs(etcd_ip, service, @name, master_ip_address)
    end
  end
end

def machine_boot_console_logs
Expand Down Expand Up @@ -115,63 +251,13 @@ def etcd_ip_addresses
@tfstate['etcd'].output('etcd', 'ip_addresses')
end

def check_prerequisites
raise 'AWS credentials not defined' unless credentials_defined?
raise 'TF_VAR_tectonic_aws_ssh_key is not defined' unless ssh_key_defined?
raise 'TF_VAR_tectonic_aws_region is not defined' unless region_defined?

super
end

def region_defined?
EnvVar.set?(%w[TF_VAR_tectonic_aws_region])
end

def credentials_defined?
credential_names = %w[AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY]
profile_name = %w[AWS_PROFILE]
session_token = %w[
AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY
AWS_SESSION_TOKEN
]
EnvVar.set?(credential_names) ||
EnvVar.set?(profile_name) ||
EnvVar.set?(session_token)
end

def ssh_key_defined?
EnvVar.set?(%w[TF_VAR_tectonic_aws_ssh_key])
end

def recover_from_failed_destroy
Grafiti.new(@build_path, ENV['TF_VAR_tectonic_aws_region']).clean
super
end

# Resolve the console URL for the UI tests: prefer the external ingress
# FQDN, fall back to the internal one, and fail loudly when neither is
# present in the terraform state.
#
# @return [String] the ingress FQDN to point the UI tests at
# @raise [RuntimeError] when no ingress FQDN is available
def tectonic_console_url
  Dir.chdir(@build_path) do
    external = @tfstate['topology'].output('dns', 'ingress_external_fqdn')
    return external unless external.empty?

    internal = @tfstate['topology'].output('dns', 'ingress_internal_fqdn')
    return internal unless internal.empty?

    raise 'failed to get the console url to use in the UI tests.'
  end
end

# TODO: Remove once other platforms caught up
private

def init
env = env_variables
env['TF_INIT_OPTIONS'] = '-no-color'
def license_and_pull_secret_defined?
license_path = 'TF_VAR_tectonic_license_path'
pull_secret_path = 'TF_VAR_tectonic_pull_secret_path'

run_tectonic_cli(env, 'init', '--config=config.yaml')
# The config within the build folder is the source of truth after init
@config_file = ConfigFile.new(File.expand_path("#{@name}/config.yaml"))
EnvVar.set?([license_path, pull_secret_path])
end

def apply
Expand Down Expand Up @@ -276,18 +362,6 @@ def tf_value(v)
end
end

private

# def destroy
# # For debugging purposes (see: https://github.com/terraform-providers/terraform-provider-aws/pull/1051)
# describe_network_interfaces

# super

# # For debugging purposes (see: https://github.com/terraform-providers/terraform-provider-aws/pull/1051)
# describe_network_interfaces
# end

def describe_network_interfaces
puts 'describing network interfaces for debugging purposes'
vpc_id = @tfstate['topology'].value('module.vpc.aws_vpc.cluster_vpc', 'id')
Expand All @@ -310,4 +384,88 @@ def save_terraform_logs(tectonic_logs, output)
save_to_file << output
save_to_file.close
end

# Block until the cluster is usable: wait out the initial reboot, wait for
# the bootstrap services, then poll `kubectl cluster-info` (up to 20
# minutes) and finally wait for all nodes to report Ready.
def wait_til_ready
  sleep_wait_for_reboot
  wait_for_bootstrapping

  deadline = Time.now + 1200 # 20 minute budget for the API to answer
  begin
    KubeCTL.run(@kubeconfig, 'cluster-info')
  rescue KubeCTL::KubeCTLCmdError
    raise 'kubectl cluster-info never returned with successful error code' if Time.now - deadline > 0
    sleep 10
    retry
  end

  wait_nodes_ready
end

# Wait for the bootstrap services (bootkube, then tectonic) to finish on
# the master nodes.
#
# @raise [RuntimeError] when no master IPs are known
def wait_for_bootstrapping
  master_ips = master_ip_addresses
  raise 'Empty master ips. Aborting...' if master_ips.empty?

  %w[bootkube tectonic].each { |service| wait_for_service(service, master_ips) }
  puts 'HOORAY! The cluster is up'
end

# Wait before the first SSH into the servers: sshing during the torcx
# reboot can leave the shutdown in a weird state, and is the likely cause
# of connection timeouts previously seen while spinning up clusters.
#
# @param total_seconds [Numeric] overall wait budget (default: 5 minutes)
# @param interval [Numeric] pause between progress messages (default: 30s)
def sleep_wait_for_reboot(total_seconds = 300, interval = 30)
  from = Time.now
  loop do
    # Read the clock before deciding to sleep again: the old code tested a
    # stale `elapsed` captured before `sleep`, overshooting the budget by a
    # full interval.
    elapsed = Time.now - from
    break if elapsed > total_seconds

    # Clamp so we never print a negative remaining time.
    remaining = [(total_seconds - elapsed).round, 0].max
    puts "Sleeping for #{(total_seconds / 60.0).round} minutes. Remaining #{remaining} seconds. Giving some time to the server reboot."
    sleep interval
  end
  puts 'Done. Lets check the cluster now...'
end

# Poll the given master IPs until `service` reports it has finished
# bootstrapping, failing after TIMEOUT_IN_SECONDS. On timeout a forensic
# pass is attempted to collect logs before re-raising.
#
# @param service [String] systemd unit to wait for (e.g. 'bootkube')
# @param ips [Array<String>] master node IPs probed over SSH
# @raise [RuntimeError] when the service never finishes within the timeout
def wait_for_service(service, ips)
  from = Time.now

  # Use the class-wide timeout constant instead of re-spelling 30 * 60.
  ::Timeout.timeout(TIMEOUT_IN_SECONDS) do
    loop do
      return if service_finished_bootstrapping?(ips, service)

      elapsed = Time.now - from
      # elapsed is roughly a multiple of the 10s sleep, so this modulo gate
      # fires on most iterations; kept for output parity.
      if (elapsed.round % 5).zero?
        puts "Waiting for bootstrapping of #{service} service to complete..."
        puts "Checked master nodes: #{ips}"
      end
      sleep 10
    end
  end
rescue Timeout::Error
  puts 'Trying to collect the logs...' # grammar fixed ("Trying to collecting")
  forensic(false) # collect logs when waiting for the service timed out
  raise "timeout waiting for #{service} service to bootstrap on any of: #{ips}"
end

# True as soon as `service` reports state active with SubState "exited" on
# any of the given IPs; SSH failures on individual hosts are logged and
# treated as "not finished yet".
#
# @param ips [Array<String>] hosts to probe
# @param service [String] systemd unit name
def service_finished_bootstrapping?(ips, service)
  probe = "systemctl is-active #{service} --quiet && [ $(systemctl show -p SubState --value #{service}) == \"exited\" ]"
  # #any? short-circuits on the first host that reports success, matching
  # the original early return.
  ips.any? do |ip|
    exit_code = 1 # assume failure until the probe says otherwise
    begin
      _stdout, _stderr, exit_code = ssh_exec(ip, probe)
    rescue => e
      puts "failed to ssh exec on ip #{ip} with: #{e}"
    end

    if exit_code.zero?
      puts "#{service} service finished successfully on ip #{ip}"
      true
    else
      false
    end
  end
end

# Fetch the cluster's node list as parsed `kubectl get nodes` output, for
# debugging and caller-side assertions.
def describe_nodes
  KubeCTL.run_and_parse(@kubeconfig, 'get nodes')
end
end
Loading