Skip to content

Commit

Permalink
Improve kubectl retries (#476)
Browse files Browse the repository at this point in the history
* Makes kubectl#run accept a whitelist of error types to retry. Initially, template validation uses it to retry client timeouts only.
* Make kubectl#run's retry delay more sophisticated
  • Loading branch information
KnVerey authored May 1, 2019
1 parent 7377fe9 commit ab599ee
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 56 deletions.
27 changes: 13 additions & 14 deletions lib/kubernetes-deploy/deploy_task.rb
Original file line number Diff line number Diff line change
Expand Up @@ -552,15 +552,20 @@ def confirm_cluster_reachable
end

def confirm_namespace_exists
st, err = nil
with_retries(2) do
_, err, st = kubectl.run("get", "namespace", @namespace, use_namespace: false, log_failure: true)
st.success? || err.include?(KubernetesDeploy::Kubectl::NOT_FOUND_ERROR_TEXT)
end
raise FatalDeploymentError, "Failed to find namespace. #{err}" unless st.success?
raise FatalDeploymentError, "Namespace #{@namespace} not found" unless namespace_definition.present?
@logger.info("Namespace #{@namespace} found")
end

def namespace_definition
@namespace_definition ||= begin
definition, _err, st = kubectl.run("get", "namespace", @namespace, use_namespace: false,
log_failure: true, raise_if_not_found: true, attempts: 3, output: 'json')
st.success? ? JSON.parse(definition, symbolize_names: true) : nil
end
rescue Kubectl::ResourceNotFoundError
nil
end

# make sure to never prune the ejson-keys secret
def confirm_ejson_keys_not_prunable
secret = ejson_provisioner.ejson_keys_secret
Expand All @@ -574,14 +579,8 @@ def confirm_ejson_keys_not_prunable
end

def tags_from_namespace_labels
namespace_info = nil
with_retries(2) do
namespace_info, _, st = kubectl.run("get", "namespace", @namespace, "-o", "json", use_namespace: false,
log_failure: true)
st.success?
end
return [] if namespace_info.blank?
namespace_labels = JSON.parse(namespace_info, symbolize_names: true).fetch(:metadata, {}).fetch(:labels, {})
return [] if namespace_definition.blank?
namespace_labels = namespace_definition.fetch(:metadata, {}).fetch(:labels, {})
namespace_labels.map { |key, value| "#{key}:#{value}" }
end

Expand Down
3 changes: 2 additions & 1 deletion lib/kubernetes-deploy/ejson_secret_provisioner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ def build_secrets
secrets.map do |secret_name, secret_spec|
validate_secret_spec(secret_name, secret_spec)
resource = generate_secret_resource(secret_name, secret_spec["_type"], secret_spec["data"])
unless resource.validate_definition(@kubectl)
resource.validate_definition(@kubectl)
if resource.validation_failed?
raise EjsonSecretError, "Resulting resource Secret/#{secret_name} failed validation"
end
resource
Expand Down
63 changes: 43 additions & 20 deletions lib/kubernetes-deploy/kubectl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

module KubernetesDeploy
class Kubectl
ERROR_MATCHERS = {
not_found: /NotFound/,
client_timeout: /Client\.Timeout exceeded while awaiting headers/,
}
DEFAULT_TIMEOUT = 15
NOT_FOUND_ERROR_TEXT = 'NotFound'
MAX_RETRY_DELAY = 16

class ResourceNotFoundError < StandardError; end

Expand All @@ -21,43 +25,45 @@ def initialize(namespace:, context:, logger:, log_failure_by_default:, default_t
end

def run(*args, log_failure: nil, use_context: true, use_namespace: true, output: nil,
raise_if_not_found: false, attempts: 1, output_is_sensitive: nil)
raise_if_not_found: false, attempts: 1, output_is_sensitive: nil, retry_whitelist: nil)
log_failure = @log_failure_by_default if log_failure.nil?
output_is_sensitive = @output_is_sensitive_default if output_is_sensitive.nil?

args = args.unshift("kubectl")
args.push("--namespace=#{@namespace}") if use_namespace
args.push("--context=#{@context}") if use_context
args.push("--output=#{output}") if output
args.push("--request-timeout=#{@default_timeout}") if @default_timeout
cmd = build_command_from_options(args, use_namespace, use_context, output)
out, err, st = nil

(1..attempts).to_a.each do |attempt|
@logger.debug("Running command (attempt #{attempt}): #{args.join(' ')}")
out, err, st = Open3.capture3(*args)
(1..attempts).to_a.each do |current_attempt|
@logger.debug("Running command (attempt #{current_attempt}): #{cmd.join(' ')}")
out, err, st = Open3.capture3(*cmd)
@logger.debug("Kubectl out: " + out.gsub(/\s+/, ' ')) unless output_is_sensitive

break if st.success?
raise(ResourceNotFoundError, err) if err.match(ERROR_MATCHERS[:not_found]) && raise_if_not_found

if log_failure
@logger.warn("The following command failed (attempt #{attempt}/#{attempts}): #{Shellwords.join(args)}")
warning = if current_attempt == attempts
"The following command failed (attempt #{current_attempt}/#{attempts})"
elsif retriable_err?(err, retry_whitelist)
"The following command failed and will be retried (attempt #{current_attempt}/#{attempts})"
else
"The following command failed and cannot be retried"
end
@logger.warn("#{warning}: #{Shellwords.join(cmd)}")
@logger.warn(err) unless output_is_sensitive
end

if err.match(NOT_FOUND_ERROR_TEXT)
raise(ResourceNotFoundError, err) if raise_if_not_found
else
@logger.debug("Kubectl err: #{err}") unless output_is_sensitive
StatsD.increment('kubectl.error', 1, tags: { context: @context, namespace: @namespace, cmd: args[1] })
@logger.debug("Kubectl err: #{output_is_sensitive ? '<suppressed sensitive output>' : err}")
end
sleep(retry_delay(attempt)) unless attempt == attempts
StatsD.increment('kubectl.error', 1, tags: { context: @context, namespace: @namespace, cmd: cmd[1] })

break unless retriable_err?(err, retry_whitelist) && current_attempt < attempts
sleep(retry_delay(current_attempt))
end

[out.chomp, err.chomp, st]
end

def retry_delay(attempt)
attempt
# exponential backoff starting at 1s with cap at 16s, offset by up to 0.5s
[2**(attempt - 1), MAX_RETRY_DELAY].min - Random.rand(0.5).round(1)
end

def version_info
Expand All @@ -79,6 +85,23 @@ def server_version

private

def build_command_from_options(args, use_namespace, use_context, output)
cmd = ["kubectl"] + args
cmd.push("--namespace=#{@namespace}") if use_namespace
cmd.push("--context=#{@context}") if use_context
cmd.push("--output=#{output}") if output
cmd.push("--request-timeout=#{@default_timeout}") if @default_timeout
cmd
end

def retriable_err?(err, retry_whitelist)
return !err.match(ERROR_MATCHERS[:not_found]) if retry_whitelist.nil?
retry_whitelist.any? do |retriable|
raise NotImplementedError, "No matcher defined for #{retriable.inspect}" unless ERROR_MATCHERS.key?(retriable)
err.match(ERROR_MATCHERS[retriable])
end
end

def extract_version_info_from_kubectl_response(response)
info = {}
response.each_line do |l|
Expand Down
27 changes: 15 additions & 12 deletions lib/kubernetes-deploy/kubernetes_resource.rb
Original file line number Diff line number Diff line change
Expand Up @@ -122,18 +122,8 @@ def validate_definition(kubectl, selector: nil)
@validation_errors = []
validate_selector(selector) if selector
validate_timeout_annotation

command = ["create", "-f", file_path, "--dry-run", "--output=name"]
_, err, st = kubectl.run(*command, log_failure: false, output_is_sensitive: sensitive_template_content?)
return true if st.success?
if sensitive_template_content?
@validation_errors << <<-EOS
Validation for #{id} failed. Detailed information is unavailable as the raw error may contain sensitive data.
EOS
else
@validation_errors << err
end
false
validate_spec_with_kubectl(kubectl)
@validation_errors.present?
end

def validation_error_msg
Expand Down Expand Up @@ -433,6 +423,19 @@ def validate_selector(selector)
end
end

def validate_spec_with_kubectl(kubectl)
command = ["create", "-f", file_path, "--dry-run", "--output=name"]
_, err, st = kubectl.run(*command, log_failure: false, output_is_sensitive: sensitive_template_content?,
retry_whitelist: [:client_timeout], attempts: 3)

return true if st.success?
@validation_errors << if sensitive_template_content?
"Validation for #{id} failed. Detailed information is unavailable as the raw error may contain sensitive data."
else
err
end
end

def labels
@definition.dig("metadata", "labels")
end
Expand Down
5 changes: 4 additions & 1 deletion test/integration/kubernetes_deploy_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,10 @@ def test_failed_deploy_to_nonexistent_namespace
original_ns = @namespace
@namespace = 'this-certainly-should-not-exist'
assert_deploy_failure(deploy_fixtures("hello-cloud", subset: ['configmap-data.yml']))
assert_logs_match(/Result: FAILURE.*namespaces "this-certainly-should-not-exist" not found/m)
assert_logs_match_all([
"Result: FAILURE",
"Namespace this-certainly-should-not-exist not found",
], in_order: true)
ensure
@namespace = original_ns
end
Expand Down
21 changes: 14 additions & 7 deletions test/unit/kubernetes-deploy/ejson_secret_provisioner_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
class EjsonSecretProvisionerTest < KubernetesDeploy::TestCase
def test_resources_based_on_ejson_file_existence
stub_ejson_keys_get_request
stub_kubectl_response("create", "-f", anything, "--dry-run", "--output=name",
kwargs: { log_failure: false, output_is_sensitive: true }, resp: dummy_secret_hash, json: false).times(3)

stub_dry_run_validation_request.times(3) # there are three secrets in the ejson
assert_empty(build_provisioner(fixture_path('hello-cloud')).resources)
refute_empty(build_provisioner(fixture_path('ejson-cloud')).resources)
end
Expand All @@ -21,9 +19,7 @@ def test_run_with_secrets_file_invalid_json

def test_resource_is_built_correctly
stub_ejson_keys_get_request
stub_kubectl_response("create", "-f", anything, "--dry-run", "--output=name",
kwargs: { log_failure: false, output_is_sensitive: true }, resp: dummy_secret_hash, json: false).times(3)

stub_dry_run_validation_request.times(3) # there are three secrets in the ejson
resources = build_provisioner(fixture_path('ejson-cloud')).resources
refute_empty(resources)

Expand Down Expand Up @@ -96,7 +92,8 @@ def test_run_with_incomplete_secret_spec

def test_proactively_validates_resulting_resources_and_raises_without_logging
stub_ejson_keys_get_request
KubernetesDeploy::Secret.any_instance.expects(:validate_definition).returns(false)
stub_dry_run_validation_request
KubernetesDeploy::Secret.any_instance.expects(:validation_failed?).returns(true)
msg = "Generation of Kubernetes secrets from ejson failed: Resulting resource Secret/catphotoscom failed validation"
assert_raises_message(KubernetesDeploy::EjsonSecretError, msg) do
build_provisioner(fixture_path('ejson-cloud')).resources
Expand All @@ -112,6 +109,16 @@ def stub_ejson_keys_get_request
resp: dummy_ejson_secret)
end

def stub_dry_run_validation_request
stub_kubectl_response("create", "-f", anything, "--dry-run", "--output=name", resp: dummy_secret_hash, json: false,
kwargs: {
log_failure: false,
output_is_sensitive: true,
retry_whitelist: [:client_timeout],
attempts: 3,
})
end

def correct_ejson_key_secret_data
{
fixture_public_key => "fedcc95132e9b399ee1f404364fdbc81bdbe4feb5b8292b061f1022481157d5a",
Expand Down
Loading

0 comments on commit ab599ee

Please sign in to comment.