From 6c78e3790f3d939876b7c4f72ddb74745d44477e Mon Sep 17 00:00:00 2001 From: Luke Meyer Date: Mon, 25 Sep 2017 21:39:15 -0400 Subject: [PATCH] diagnostics: introduce AppCreate --- contrib/completions/bash/oc | 134 +++++++ contrib/completions/zsh/oc | 134 +++++++ docs/man/man1/.files_generated_oc | 2 + docs/man/man1/oc-adm-diagnostics-appcreate.1 | 3 + docs/man/man1/oc-ex-diagnostics-appcreate.1 | 3 + pkg/oc/admin/diagnostics/cluster.go | 12 + pkg/oc/admin/diagnostics/diagnostics.go | 2 + .../diagnostics/cluster/app_create/app.go | 126 +++++++ .../diagnostics/cluster/app_create/main.go | 357 ++++++++++++++++++ .../diagnostics/cluster/app_create/result.go | 50 +++ .../diagnostics/cluster/app_create/route.go | 159 ++++++++ .../diagnostics/cluster/app_create/service.go | 146 +++++++ .../cluster/app_create/setup_cleanup.go | 123 ++++++ .../diagnostics/cluster/master_node.go | 3 +- .../diagnostics/cluster/node_definitions.go | 3 +- .../diagnostics/cluster/registry.go | 3 +- .../diagnostics/cluster/rolebindings.go | 7 +- .../diagnostics/diagnostics/cluster/roles.go | 7 +- .../diagnostics/cluster/route_validation.go | 3 +- .../diagnostics/diagnostics/cluster/router.go | 3 +- .../diagnostics/diagnostics/cluster/util.go | 23 -- .../diagnostics/diagnostics/util/util.go | 22 ++ pkg/oc/admin/project/new_project.go | 16 +- 23 files changed, 1303 insertions(+), 38 deletions(-) create mode 100644 docs/man/man1/oc-adm-diagnostics-appcreate.1 create mode 100644 docs/man/man1/oc-ex-diagnostics-appcreate.1 create mode 100644 pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/app.go create mode 100644 pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/main.go create mode 100644 pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/result.go create mode 100644 pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/route.go create mode 100644 pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/service.go create mode 100644 pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/setup_cleanup.go delete mode 100644 pkg/oc/admin/diagnostics/diagnostics/cluster/util.go diff --git a/contrib/completions/bash/oc b/contrib/completions/bash/oc index 86b07f88450a..4fd426805162 100644 --- a/contrib/completions/bash/oc +++ b/contrib/completions/bash/oc @@ -2464,6 +2464,42 @@ _oc_adm_diagnostics_all() flags+=("--aggregatedlogging-logging-project=") local_nonpersistent_flags+=("--aggregatedlogging-logging-project=") + flags+=("--appcreate-admission-timeout=") + local_nonpersistent_flags+=("--appcreate-admission-timeout=") + flags+=("--appcreate-app-image=") + local_nonpersistent_flags+=("--appcreate-app-image=") + flags+=("--appcreate-app-name=") + local_nonpersistent_flags+=("--appcreate-app-name=") + flags+=("--appcreate-app-port=") + local_nonpersistent_flags+=("--appcreate-app-port=") + flags+=("--appcreate-deploy-timeout=") + local_nonpersistent_flags+=("--appcreate-deploy-timeout=") + flags+=("--appcreate-http-retries=") + local_nonpersistent_flags+=("--appcreate-http-retries=") + flags+=("--appcreate-http-timeout=") + local_nonpersistent_flags+=("--appcreate-http-timeout=") + flags+=("--appcreate-keep-app") + local_nonpersistent_flags+=("--appcreate-keep-app") + flags+=("--appcreate-keep-project") + local_nonpersistent_flags+=("--appcreate-keep-project") + flags+=("--appcreate-node-selector=") + local_nonpersistent_flags+=("--appcreate-node-selector=") + flags+=("--appcreate-project=") + local_nonpersistent_flags+=("--appcreate-project=") + flags+=("--appcreate-project-base=") + 
local_nonpersistent_flags+=("--appcreate-project-base=") + flags+=("--appcreate-result-dir=") + local_nonpersistent_flags+=("--appcreate-result-dir=") + flags+=("--appcreate-route-host=") + local_nonpersistent_flags+=("--appcreate-route-host=") + flags+=("--appcreate-route-port=") + local_nonpersistent_flags+=("--appcreate-route-port=") + flags+=("--appcreate-skip-route-connect") + local_nonpersistent_flags+=("--appcreate-skip-route-connect") + flags+=("--appcreate-skip-route-test") + local_nonpersistent_flags+=("--appcreate-skip-route-test") + flags+=("--appcreate-skip-service-connect") + local_nonpersistent_flags+=("--appcreate-skip-service-connect") flags+=("--cluster-context=") local_nonpersistent_flags+=("--cluster-context=") flags+=("--config=") @@ -2597,6 +2633,103 @@ _oc_adm_diagnostics_analyzelogs() noun_aliases=() } +_oc_adm_diagnostics_appcreate() +{ + last_command="oc_adm_diagnostics_appcreate" + commands=() + + flags=() + two_word_flags=() + local_nonpersistent_flags=() + flags_with_completion=() + flags_completion=() + + flags+=("--admission-timeout=") + local_nonpersistent_flags+=("--admission-timeout=") + flags+=("--app-image=") + local_nonpersistent_flags+=("--app-image=") + flags+=("--app-name=") + local_nonpersistent_flags+=("--app-name=") + flags+=("--app-port=") + local_nonpersistent_flags+=("--app-port=") + flags+=("--cluster-context=") + local_nonpersistent_flags+=("--cluster-context=") + flags+=("--config=") + flags_with_completion+=("--config") + flags_completion+=("_filedir") + local_nonpersistent_flags+=("--config=") + flags+=("--context=") + local_nonpersistent_flags+=("--context=") + flags+=("--deploy-timeout=") + local_nonpersistent_flags+=("--deploy-timeout=") + flags+=("--diaglevel=") + two_word_flags+=("-l") + local_nonpersistent_flags+=("--diaglevel=") + flags+=("--http-retries=") + local_nonpersistent_flags+=("--http-retries=") + flags+=("--http-timeout=") + local_nonpersistent_flags+=("--http-timeout=") + flags+=("--keep-app") + local_nonpersistent_flags+=("--keep-app") + flags+=("--keep-project") + local_nonpersistent_flags+=("--keep-project") + flags+=("--loglevel=") + local_nonpersistent_flags+=("--loglevel=") + flags+=("--logspec=") + local_nonpersistent_flags+=("--logspec=") + flags+=("--node-selector=") + local_nonpersistent_flags+=("--node-selector=") + flags+=("--prevent-modification") + local_nonpersistent_flags+=("--prevent-modification") + flags+=("--project=") + local_nonpersistent_flags+=("--project=") + flags+=("--project-base=") + local_nonpersistent_flags+=("--project-base=") + flags+=("--result-dir=") + local_nonpersistent_flags+=("--result-dir=") + flags+=("--route-host=") + local_nonpersistent_flags+=("--route-host=") + flags+=("--route-port=") + local_nonpersistent_flags+=("--route-port=") + flags+=("--skip-route-connect") + local_nonpersistent_flags+=("--skip-route-connect") + flags+=("--skip-route-test") + local_nonpersistent_flags+=("--skip-route-test") + flags+=("--skip-service-connect") + local_nonpersistent_flags+=("--skip-service-connect") + flags+=("--v=") + local_nonpersistent_flags+=("--v=") + flags+=("--vmodule=") + local_nonpersistent_flags+=("--vmodule=") + flags+=("--as=") + flags+=("--as-group=") + flags+=("--cache-dir=") + flags+=("--certificate-authority=") + flags_with_completion+=("--certificate-authority") + flags_completion+=("_filedir") + flags+=("--client-certificate=") + flags_with_completion+=("--client-certificate") + flags_completion+=("_filedir") + flags+=("--client-key=") + 
flags_with_completion+=("--client-key") + flags_completion+=("_filedir") + flags+=("--cluster=") + flags+=("--insecure-skip-tls-verify") + flags+=("--log-flush-frequency=") + flags+=("--match-server-version") + flags+=("--namespace=") + two_word_flags+=("-n") + flags+=("--request-timeout=") + flags+=("--server=") + flags+=("--token=") + flags+=("--user=") + flags+=("--version") + + must_have_one_flag=() + must_have_one_noun=() + noun_aliases=() +} + _oc_adm_diagnostics_clusterregistry() { last_command="oc_adm_diagnostics_clusterregistry" @@ -3608,6 +3741,7 @@ _oc_adm_diagnostics() commands+=("aggregatedlogging") commands+=("all") commands+=("analyzelogs") + commands+=("appcreate") commands+=("clusterregistry") commands+=("clusterrolebindings") commands+=("clusterroles") diff --git a/contrib/completions/zsh/oc b/contrib/completions/zsh/oc index df1dbfa8f7c8..dea46d12801d 100644 --- a/contrib/completions/zsh/oc +++ b/contrib/completions/zsh/oc @@ -2606,6 +2606,42 @@ _oc_adm_diagnostics_all() flags+=("--aggregatedlogging-logging-project=") local_nonpersistent_flags+=("--aggregatedlogging-logging-project=") + flags+=("--appcreate-admission-timeout=") + local_nonpersistent_flags+=("--appcreate-admission-timeout=") + flags+=("--appcreate-app-image=") + local_nonpersistent_flags+=("--appcreate-app-image=") + flags+=("--appcreate-app-name=") + local_nonpersistent_flags+=("--appcreate-app-name=") + flags+=("--appcreate-app-port=") + local_nonpersistent_flags+=("--appcreate-app-port=") + flags+=("--appcreate-deploy-timeout=") + local_nonpersistent_flags+=("--appcreate-deploy-timeout=") + flags+=("--appcreate-http-retries=") + local_nonpersistent_flags+=("--appcreate-http-retries=") + flags+=("--appcreate-http-timeout=") + local_nonpersistent_flags+=("--appcreate-http-timeout=") + flags+=("--appcreate-keep-app") + local_nonpersistent_flags+=("--appcreate-keep-app") + flags+=("--appcreate-keep-project") + local_nonpersistent_flags+=("--appcreate-keep-project") + flags+=("--appcreate-node-selector=") + local_nonpersistent_flags+=("--appcreate-node-selector=") + flags+=("--appcreate-project=") + local_nonpersistent_flags+=("--appcreate-project=") + flags+=("--appcreate-project-base=") + local_nonpersistent_flags+=("--appcreate-project-base=") + flags+=("--appcreate-result-dir=") + local_nonpersistent_flags+=("--appcreate-result-dir=") + flags+=("--appcreate-route-host=") + local_nonpersistent_flags+=("--appcreate-route-host=") + flags+=("--appcreate-route-port=") + local_nonpersistent_flags+=("--appcreate-route-port=") + flags+=("--appcreate-skip-route-connect") + local_nonpersistent_flags+=("--appcreate-skip-route-connect") + flags+=("--appcreate-skip-route-test") + local_nonpersistent_flags+=("--appcreate-skip-route-test") + flags+=("--appcreate-skip-service-connect") + local_nonpersistent_flags+=("--appcreate-skip-service-connect") flags+=("--cluster-context=") local_nonpersistent_flags+=("--cluster-context=") flags+=("--config=") @@ -2739,6 +2775,103 @@ _oc_adm_diagnostics_analyzelogs() noun_aliases=() } +_oc_adm_diagnostics_appcreate() +{ + last_command="oc_adm_diagnostics_appcreate" + commands=() + + flags=() + two_word_flags=() + local_nonpersistent_flags=() + flags_with_completion=() + flags_completion=() + + flags+=("--admission-timeout=") + local_nonpersistent_flags+=("--admission-timeout=") + flags+=("--app-image=") + local_nonpersistent_flags+=("--app-image=") + flags+=("--app-name=") + local_nonpersistent_flags+=("--app-name=") + flags+=("--app-port=") + 
local_nonpersistent_flags+=("--app-port=") + flags+=("--cluster-context=") + local_nonpersistent_flags+=("--cluster-context=") + flags+=("--config=") + flags_with_completion+=("--config") + flags_completion+=("_filedir") + local_nonpersistent_flags+=("--config=") + flags+=("--context=") + local_nonpersistent_flags+=("--context=") + flags+=("--deploy-timeout=") + local_nonpersistent_flags+=("--deploy-timeout=") + flags+=("--diaglevel=") + two_word_flags+=("-l") + local_nonpersistent_flags+=("--diaglevel=") + flags+=("--http-retries=") + local_nonpersistent_flags+=("--http-retries=") + flags+=("--http-timeout=") + local_nonpersistent_flags+=("--http-timeout=") + flags+=("--keep-app") + local_nonpersistent_flags+=("--keep-app") + flags+=("--keep-project") + local_nonpersistent_flags+=("--keep-project") + flags+=("--loglevel=") + local_nonpersistent_flags+=("--loglevel=") + flags+=("--logspec=") + local_nonpersistent_flags+=("--logspec=") + flags+=("--node-selector=") + local_nonpersistent_flags+=("--node-selector=") + flags+=("--prevent-modification") + local_nonpersistent_flags+=("--prevent-modification") + flags+=("--project=") + local_nonpersistent_flags+=("--project=") + flags+=("--project-base=") + local_nonpersistent_flags+=("--project-base=") + flags+=("--result-dir=") + local_nonpersistent_flags+=("--result-dir=") + flags+=("--route-host=") + local_nonpersistent_flags+=("--route-host=") + flags+=("--route-port=") + local_nonpersistent_flags+=("--route-port=") + flags+=("--skip-route-connect") + local_nonpersistent_flags+=("--skip-route-connect") + flags+=("--skip-route-test") + local_nonpersistent_flags+=("--skip-route-test") + flags+=("--skip-service-connect") + local_nonpersistent_flags+=("--skip-service-connect") + flags+=("--v=") + local_nonpersistent_flags+=("--v=") + flags+=("--vmodule=") + local_nonpersistent_flags+=("--vmodule=") + flags+=("--as=") + flags+=("--as-group=") + flags+=("--cache-dir=") + flags+=("--certificate-authority=") + flags_with_completion+=("--certificate-authority") + flags_completion+=("_filedir") + flags+=("--client-certificate=") + flags_with_completion+=("--client-certificate") + flags_completion+=("_filedir") + flags+=("--client-key=") + flags_with_completion+=("--client-key") + flags_completion+=("_filedir") + flags+=("--cluster=") + flags+=("--insecure-skip-tls-verify") + flags+=("--log-flush-frequency=") + flags+=("--match-server-version") + flags+=("--namespace=") + two_word_flags+=("-n") + flags+=("--request-timeout=") + flags+=("--server=") + flags+=("--token=") + flags+=("--user=") + flags+=("--version") + + must_have_one_flag=() + must_have_one_noun=() + noun_aliases=() +} + _oc_adm_diagnostics_clusterregistry() { last_command="oc_adm_diagnostics_clusterregistry" @@ -3750,6 +3883,7 @@ _oc_adm_diagnostics() commands+=("aggregatedlogging") commands+=("all") commands+=("analyzelogs") + commands+=("appcreate") commands+=("clusterregistry") commands+=("clusterrolebindings") commands+=("clusterroles") diff --git a/docs/man/man1/.files_generated_oc b/docs/man/man1/.files_generated_oc index a30aeb410038..0ec75168cbf2 100644 --- a/docs/man/man1/.files_generated_oc +++ b/docs/man/man1/.files_generated_oc @@ -40,6 +40,7 @@ oc-adm-create-signer-cert.1 oc-adm-diagnostics-aggregatedlogging.1 oc-adm-diagnostics-all.1 oc-adm-diagnostics-analyzelogs.1 +oc-adm-diagnostics-appcreate.1 oc-adm-diagnostics-clusterregistry.1 oc-adm-diagnostics-clusterrolebindings.1 oc-adm-diagnostics-clusterroles.1 @@ -195,6 +196,7 @@ oc-ex-config.1 
oc-ex-diagnostics-aggregatedlogging.1 oc-ex-diagnostics-all.1 oc-ex-diagnostics-analyzelogs.1 +oc-ex-diagnostics-appcreate.1 oc-ex-diagnostics-clusterregistry.1 oc-ex-diagnostics-clusterrolebindings.1 oc-ex-diagnostics-clusterroles.1 diff --git a/docs/man/man1/oc-adm-diagnostics-appcreate.1 b/docs/man/man1/oc-adm-diagnostics-appcreate.1 new file mode 100644 index 000000000000..b6fd7a0f9896 --- /dev/null +++ b/docs/man/man1/oc-adm-diagnostics-appcreate.1 @@ -0,0 +1,3 @@ +This file is autogenerated, but we've stopped checking such files into the +repository to reduce the need for rebases. Please run hack/generate-docs.sh to +populate this file. diff --git a/docs/man/man1/oc-ex-diagnostics-appcreate.1 b/docs/man/man1/oc-ex-diagnostics-appcreate.1 new file mode 100644 index 000000000000..b6fd7a0f9896 --- /dev/null +++ b/docs/man/man1/oc-ex-diagnostics-appcreate.1 @@ -0,0 +1,3 @@ +This file is autogenerated, but we've stopped checking such files into the +repository to reduce the need for rebases. Please run hack/generate-docs.sh to +populate this file. diff --git a/pkg/oc/admin/diagnostics/cluster.go b/pkg/oc/admin/diagnostics/cluster.go index 002b7557083b..8ded7eaf40f9 100644 --- a/pkg/oc/admin/diagnostics/cluster.go +++ b/pkg/oc/admin/diagnostics/cluster.go @@ -17,6 +17,7 @@ import ( oauthclient "github.com/openshift/origin/pkg/oauth/generated/internalclientset" clustdiags "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/cluster" agldiags "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/cluster/aggregated_logging" + appcreate "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create" "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types" osclientcmd "github.com/openshift/origin/pkg/oc/cli/util/clientcmd" projectclient "github.com/openshift/origin/pkg/project/generated/internalclientset" @@ -30,6 +31,7 @@ import ( func availableClusterDiagnostics() types.DiagnosticList { return types.DiagnosticList{ &agldiags.AggregatedLogging{}, + appcreate.NewDefaultAppCreateDiagnostic(), &clustdiags.ClusterRegistry{}, &clustdiags.ClusterRouter{}, &clustdiags.ClusterRoles{}, @@ -96,6 +98,16 @@ func (o DiagnosticsOptions) buildClusterDiagnostics(rawConfig *clientcmdapi.Conf case agldiags.AggregatedLoggingName: p := o.ParameterizedDiagnostics[agldiags.AggregatedLoggingName].(*agldiags.AggregatedLogging).Project d = agldiags.NewAggregatedLogging(p, kclusterClient, oauthClient.Oauth(), projectClient.Project(), routeClient.Route(), oauthorizationClient.Authorization(), appsClient.Apps(), securityClient.Security()) + case appcreate.AppCreateName: + ac := o.ParameterizedDiagnostics[diagnosticName].(*appcreate.AppCreate) + ac.KubeClient = kclusterClient + ac.ProjectClient = projectClient.Project() + ac.RouteClient = routeClient + ac.RoleBindingClient = oauthorizationClient.Authorization() + ac.SARClient = kclusterClient.Authorization() + ac.AppsClient = appsClient + ac.PreventModification = o.PreventModification + d = ac case clustdiags.NodeDefinitionsName: d = &clustdiags.NodeDefinitions{KubeClient: kclusterClient} case clustdiags.MasterNodeName: diff --git a/pkg/oc/admin/diagnostics/diagnostics.go b/pkg/oc/admin/diagnostics/diagnostics.go index f18147b21111..0672cd95ecb0 100644 --- a/pkg/oc/admin/diagnostics/diagnostics.go +++ b/pkg/oc/admin/diagnostics/diagnostics.go @@ -276,6 +276,8 @@ func bindIndividualFlags(diag types.ParameterizedDiagnostic, prefix string, flag flags.StringVar(target, name, param.Default.(string), 
param.Description) case *int: flags.IntVar(target, name, param.Default.(int), param.Description) + case *int64: + flags.Int64Var(target, name, param.Default.(int64), param.Description) case *bool: flags.BoolVar(target, name, param.Default.(bool), param.Description) default: diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/app.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/app.go new file mode 100644 index 000000000000..5f2eb4bccddd --- /dev/null +++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/app.go @@ -0,0 +1,126 @@ +package app_create + +import ( + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + kapi "k8s.io/kubernetes/pkg/apis/core" + conditions "k8s.io/kubernetes/pkg/client/unversioned" + + apps "github.com/openshift/origin/pkg/apps/apis/apps" +) + +func (d *AppCreate) createAndCheckAppDC() bool { + result := &d.result.App + result.BeginTime = jsonTime(time.Now()) + defer recordTrial(result) + if !d.createAppDC() { + return false + } + result.Success = d.checkPodRunning() + return result.Success +} + +// create the DC +func (d *AppCreate) createAppDC() bool { + defer recordTime(&d.result.App.CreatedTime) + gracePeriod := int64(0) + dc := &apps.DeploymentConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: d.appName, + Labels: d.label, + }, + Spec: apps.DeploymentConfigSpec{ + Replicas: 1, + Selector: d.label, + Triggers: []apps.DeploymentTriggerPolicy{ + {Type: apps.DeploymentTriggerOnConfigChange}, + }, + Template: &kapi.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: d.label}, + Spec: kapi.PodSpec{ + TerminationGracePeriodSeconds: &gracePeriod, + Containers: []kapi.Container{ + { + Name: d.appName, + Image: d.appImage, + Ports: []kapi.ContainerPort{ + { + Name: "http", + ContainerPort: int32(d.appPort), + Protocol: kapi.ProtocolTCP, + }, + }, + ImagePullPolicy: kapi.PullIfNotPresent, + Command: []string{ + "socat", "-T", "1", "-d", + fmt.Sprintf("%s-l:%d,reuseaddr,fork,crlf", kapi.ProtocolTCP, d.appPort), + "system:\"echo 'HTTP/1.0 200 OK'; echo 'Content-Type: text/plain'; echo; echo 'Hello'\"", + }, + ReadinessProbe: &kapi.Probe{ + // The action taken to determine the health of a container + Handler: kapi.Handler{ + HTTPGet: &kapi.HTTPGetAction{ + Path: "/", + Port: intstr.FromInt(d.appPort), + }, + }, + InitialDelaySeconds: 0, + TimeoutSeconds: 1, + PeriodSeconds: 1, + }, + }, + }, + }, + }, + }, + } + + if _, err := d.AppsClient.Apps().DeploymentConfigs(d.project).Create(dc); err != nil { + d.out.Error("DCluAC006", err, fmt.Sprintf("%s: Creating deploymentconfig '%s' failed:\n%v", now(), d.appName, err)) + return false + } + return true +} + +// wait for a pod to become active +func (d *AppCreate) checkPodRunning() bool { + defer recordTime(&d.result.App.ReadyTime) + d.out.Debug("DCluAC007", fmt.Sprintf("%s: Waiting %ds for pod to reach running state.", now(), d.deployTimeout)) + watcher, err := d.KubeClient.Core().Pods(d.project).Watch(metav1.ListOptions{LabelSelector: d.labelSelector, TimeoutSeconds: &d.deployTimeout}) + if err != nil { + d.out.Error("DCluAC008", err, fmt.Sprintf(` +%s: Failed to establish a watch for '%s' to deploy a pod: + %v +This may be a transient error. Check the master API logs for anomalies near this time. 
+ `, now(), d.appName, err)) + return false + } + defer stopWatcher(watcher) + for event := range watcher.ResultChan() { + running, err := conditions.PodContainerRunning(d.appName)(event) + if err != nil { + d.out.Error("DCluAC009", err, fmt.Sprintf(` +%s: Error while watching for app pod to deploy: + %v +This may be a transient error. Check the master API logs for anomalies near this time. + `, now(), err)) + return false + } + if running { + d.out.Info("DCluAC010", fmt.Sprintf("%s: App '%s' is running", now(), d.appName)) + return true + } + } + d.out.Error("DCluAC011", nil, fmt.Sprintf(` +%s: App pod was not in running state before timeout (%d sec) +There are many reasons why this can occur; for example: + * The app or deployer image may not be available (check pod status) + * Downloading an image may have timed out (consider increasing timeout) + * The scheduler may be unable to find an appropriate node for it to run (check deployer logs) + * The node container runtime may be malfunctioning (check node and docker/cri-o logs) + `, now(), d.deployTimeout)) + return false +} diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/main.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/main.go new file mode 100644 index 000000000000..2fbc8262cd36 --- /dev/null +++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/main.go @@ -0,0 +1,357 @@ +package app_create + +import ( + "fmt" + "net/http" + "os" + "os/signal" + "syscall" + "time" + + kvalidation "k8s.io/apimachinery/pkg/util/validation" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/apiserver/pkg/storage/names" + "k8s.io/kubernetes/pkg/apis/authorization" + kapi "k8s.io/kubernetes/pkg/apis/core" + kclientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset" + authorizationtypedclient "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/authorization/internalversion" + + appsclient "github.com/openshift/origin/pkg/apps/generated/internalclientset" + oauthorizationtypedclient "github.com/openshift/origin/pkg/authorization/generated/internalclientset/typed/authorization/internalversion" + "github.com/openshift/origin/pkg/cmd/util/variable" + "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/log" + "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types" + "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util" + osclientcmd "github.com/openshift/origin/pkg/oc/cli/util/clientcmd" + projectclient "github.com/openshift/origin/pkg/project/generated/internalclientset/typed/project/internalversion" + routeclient "github.com/openshift/origin/pkg/route/generated/internalclientset" +) + +// AppCreate is a Diagnostic to create an application and test that it deploys correctly. 
+type AppCreate struct { + PreventModification bool + KubeClient kclientset.Interface + ProjectClient projectclient.ProjectInterface + RouteClient *routeclient.Clientset + RoleBindingClient oauthorizationtypedclient.RoleBindingsGetter + AppsClient *appsclient.Clientset + SARClient authorizationtypedclient.SelfSubjectAccessReviewsGetter + Factory *osclientcmd.Factory + + // from parameters specific to this diagnostic: + // specs for the project where the diagnostic will put all test items + nodeSelector string + project string + projectBase string + keepProject bool + // run a build and deploy the result if successful + checkBuild bool + keepBuild bool + // deploy an app, service, and route + appName string + appImage string + appPort int + deployTimeout int64 + keepApp bool + routeHost string + routePort int + routeAdmissionTimeout int64 + // connection testing parameters + httpTimeout int64 + httpRetries int + skipServiceConnect bool // service SDN may not be visible from client + skipRouteTest bool // may not expect acceptance (e.g. router may not be running) + skipRouteConnect bool // DNS/network may not be as expected for client to connect to route + // misc + writeResultDir string + label map[string]string // for selecting components later + labelSelector string // for selecting components later + + // diagnostic state + out types.DiagnosticResult + result appCreateResult +} + +// create/tests results and timings +type appCreateResult struct { + BeginTime jsonTime `json:"beginTime"` // time when diagnostic begins + PrepDuration jsonDuration `json:"prepDuration"` // time required to prepare project for app creation + EndTime jsonTime `json:"endTime"` // time when all tests completed + TotalDuration jsonDuration `json:"totalDuration"` // interval between BeginTime and EndTime + Success bool `json:"success"` // overallresult + + App appCreateComponentResult `json:"app"` + Service appCreateComponentResult `json:"service"` + Route appCreateComponentResult `json:"route"` +} + +type appCreateComponentResult struct { + BeginTime jsonTime `json:"beginTime"` // begin time for create/test of this component + CreatedTime jsonTime `json:"createdTime"` // time component creation completed (or failed) + CreatedDuration jsonDuration `json:"createdDuration"` + ReadyTime jsonTime `json:"readyTime"` // time at which component is considered ready (or failed) + ReadyDuration jsonDuration `json:"readyDuration"` // interval between created and ready + TestTime jsonTime `json:"testTime"` // time at which test is considered succeeded/failed + TestDuration jsonDuration `json:"testDuration"` // interval between ready and test success/failure + EndTime jsonTime `json:"endTime"` // time when component create/test completed + TotalDuration jsonDuration `json:"totalDuration"` // interval between BeginTime and EndTime + Required bool `json:"required"` // was component actually required so result counts + Success bool `json:"success"` // overall component result (if required at all) +} + +// using this type to have duration reported as null or seconds. 
+type jsonDuration int64 + +func (d jsonDuration) MarshalJSON() ([]byte, error) { + duration := time.Duration(d) + encoding := "null" + if duration != 0 { + encoding = fmt.Sprintf("%f", duration.Seconds()) + } + return []byte(encoding), nil +} + +func (d jsonDuration) String() string { + return time.Duration(d).String() +} + +// using this type to have time reported as null when not set +type jsonTime time.Time + +func (t jsonTime) MarshalJSON() ([]byte, error) { + it := time.Time(t) + if it.IsZero() { + return []byte("null"), nil + } + return it.MarshalJSON() +} +func (t jsonTime) IsZero() bool { + return time.Time(t).IsZero() +} +func (t jsonTime) Sub(sub jsonTime) jsonDuration { + return jsonDuration(time.Time(t).Sub(time.Time(sub))) +} + +const ( + AppCreateName = "AppCreate" + + AppCreateProjectBaseDefault = "openshift-diagnostic-appcreate-" + AppCreateAppNameDefault = "diagnostic-appcreate" + AppCreateAppPortDefault = 8080 + AppCreateTimeoutDefault int64 = 120 + AppCreateHttpTimeoutDefault int64 = 500 + AppCreateHttpRetriesDefault = 10 + AppCreateRouteAdmissionTimeoutDefault int64 = 10 +) + +func (d *AppCreate) Name() string { + return AppCreateName +} + +func (d *AppCreate) Description() string { + return "Create an application and test that it deploys correctly." +} + +func (d *AppCreate) Requirements() (client bool, host bool) { + return true, false +} + +func NewDefaultAppCreateDiagnostic() *AppCreate { + return &AppCreate{ + projectBase: AppCreateProjectBaseDefault, + checkBuild: true, + appName: AppCreateAppNameDefault, + appImage: getDefaultAppImage(), + appPort: AppCreateAppPortDefault, + httpTimeout: AppCreateHttpTimeoutDefault, + httpRetries: AppCreateHttpRetriesDefault, + } +} + +func (d *AppCreate) AvailableParameters() []types.Parameter { + return []types.Parameter{ + {"project", "Project name to use instead of generating from project-base", &d.project, ""}, + {"project-base", "Base name to create randomized project name", &d.projectBase, AppCreateProjectBaseDefault}, + {"keep-project", "Do not delete randomized project when complete", &d.keepProject, false}, + {"app-name", "Name for the test application to be created", &d.appName, AppCreateAppNameDefault}, + {"app-image", "Image for the test application to be created", &d.appImage, getDefaultAppImage()}, + {"app-port", "Port at which the test application listens", &d.appPort, AppCreateAppPortDefault}, + {"route-host", "Create specific route instead of default", &d.routeHost, ""}, + {"route-port", "Router port to use for route connection test", &d.routePort, 80}, + {"deploy-timeout", "Seconds to wait for the app to be ready", &d.deployTimeout, AppCreateTimeoutDefault}, + {"admission-timeout", "Seconds to wait for the route to be admitted by a router", &d.routeAdmissionTimeout, AppCreateRouteAdmissionTimeoutDefault}, + {"skip-service-connect", "Do not test connecting to the service", &d.skipServiceConnect, false}, + {"skip-route-test", "Do not test route at all", &d.skipRouteTest, false}, + {"skip-route-connect", "Do not test connecting to the route", &d.skipRouteConnect, false}, + {"http-timeout", "Milliseconds to wait for an HTTP request to the app", &d.httpTimeout, AppCreateHttpTimeoutDefault}, + {"http-retries", "Number of times to retry an HTTP request to the app", &d.httpRetries, AppCreateHttpRetriesDefault}, + {"node-selector", "Node selector for where the test app should land", &d.nodeSelector, ""}, + {"keep-app", "Do not delete the test app when complete", &d.keepApp, false}, + {"result-dir", "Directory in which 
to write result details if desired", &d.writeResultDir, ""}, + } +} + +func getDefaultAppImage() string { + template := variable.NewDefaultImageTemplate() + return template.ExpandOrDie("deployer") +} + +func (d *AppCreate) Complete(logger *log.Logger) error { + // project management + d.keepProject = d.keepProject || d.keepApp // don't delete project if keeping app + if d.project == "" && d.projectBase == "" { + return fmt.Errorf("%s project name cannot be empty", AppCreateName) + } + if d.project == "" { + // generate a project if not specified + d.project = names.SimpleNameGenerator.GenerateName(d.projectBase) + } else { + // when an existing project is specified, deleting it is likely to surprise the user, so don't + d.keepProject = true + } + if errs := kvalidation.IsDNS1123Label(d.project); len(errs) > 0 { + return fmt.Errorf("invalid project name '%s' for AppCreate: %v", d.project, errs) + } + // TODO: also test that route is valid under DNS952 + + // app management + if d.appName == "" { + return fmt.Errorf("%s app name cannot be empty", AppCreateName) + } + if errs := kvalidation.IsDNS1123Label(d.appName); len(errs) > 0 { + return fmt.Errorf("invalid app name '%s' for AppCreate: %v", d.appName, errs) + } + if err := kvalidation.IsValidPortNum(d.appPort); err != nil { + return fmt.Errorf("invalid app port %d for AppCreate: %v", d.appPort, err) + } + d.label = map[string]string{"app": d.appName} + d.labelSelector = fmt.Sprintf("app=%s", d.appName) + + d.skipRouteConnect = d.skipRouteConnect || d.skipRouteTest // don't try to connect to route if skipping route test + + return nil +} + +func (d *AppCreate) CanRun() (bool, error) { + if d.SARClient == nil || d.AppsClient == nil || d.KubeClient == nil || d.ProjectClient == nil || d.RoleBindingClient == nil { + return false, fmt.Errorf("missing at least one client") + } + if d.PreventModification { + return false, fmt.Errorf("requires modifications: create a project and application") + } + return util.UserCan(d.SARClient, &authorization.ResourceAttributes{ + Verb: "create", + Group: kapi.GroupName, + Resource: "namespace", + }) +} + +func (d *AppCreate) Check() types.DiagnosticResult { + d.out = types.NewDiagnosticResult(AppCreateName) + done := make(chan bool, 1) + + // Jump straight to clean up if there is an interrupt/terminate signal while running diagnostic + sig := make(chan os.Signal, 1) + signal.Notify(sig, os.Interrupt, syscall.SIGTERM) + go func() { + <-sig + d.out.Warn("DCluAC001", nil, "Received interrupt; aborting diagnostic") + done <- true + }() + + // The actual diagnostic logic + go func() { + d.result.BeginTime = jsonTime(time.Now()) + defer func() { + d.result.EndTime = jsonTime(time.Now()) + d.result.TotalDuration = d.result.EndTime.Sub(d.result.BeginTime) + done <- true + }() + if !d.prepareForApp() || !d.createAndCheckAppDC() || !d.createAndCheckService() { + return // without success + // NOTE: even if we won't try to connect, we still create the service to test for endpoints + } + if d.skipRouteTest { + d.out.Debug("DCluAC002", "skipping route creation and testing as requested") + // however if we just skip connection testing we still create and test for admission + } else { + d.createAndCheckRoute() + } + d.result.Success = allSucceeded(&d.result.App, &d.result.Service, &d.result.Route) + }() + + <-done // wait until either finishes + signal.Stop(sig) + d.logResult() + d.cleanup() + return d.out +} + +func allSucceeded(components ...*appCreateComponentResult) bool { + for _, comp := range components { + if 
comp.Required && !comp.Success { + return false + } + } + return true +} + +func now() string { + return time.Now().Format("15:04:05.999") +} + +func recordTime(at *jsonTime) { + *at = jsonTime(time.Now()) +} + +func recordTrial(result *appCreateComponentResult) { + result.EndTime = jsonTime(time.Now()) + result.TotalDuration = result.EndTime.Sub(result.BeginTime) + result.Required = true + if result.CreatedTime.IsZero() { + return + } + result.CreatedDuration = result.CreatedTime.Sub(result.BeginTime) + if result.ReadyTime.IsZero() { + return + } + result.ReadyDuration = result.ReadyTime.Sub(result.CreatedTime) + if result.TestTime.IsZero() { + return + } + result.TestDuration = result.TestTime.Sub(result.ReadyTime) +} + +func stopWatcher(watcher watch.Interface) { + watcher.Stop() +} + +func (d *AppCreate) checkHttp(url string, timeout int64, retry int) error { + timeoutDuration := time.Millisecond * time.Duration(timeout) + client := &http.Client{Timeout: timeoutDuration} + var requestErr error = nil + start := time.Now() + for try := 0; try <= retry; try++ { + if requestErr != nil { // wait to retry if quick response in previous try + time.Sleep(timeoutDuration - time.Since(start)) + } + start = time.Now() + d.out.Debug("DCluAC032", fmt.Sprintf("%s: waiting %dms for an HTTP response from %s", now(), timeout, url)) + response, err := client.Get(url) + respondedTime := time.Since(start) + if err != nil { + d.out.Debug("DCluAC033", fmt.Sprintf("%s: Request to %s returned an error or timed out in %v: %v", now(), url, respondedTime, err)) + requestErr = err + continue + } + response.Body.Close() + if response.StatusCode != 200 { + requestErr = fmt.Errorf("Saw HTTP response %d", response.StatusCode) + d.out.Debug("DCluAC034", fmt.Sprintf("%s: Request to %s returned non-200 status code after %v: %v", now(), url, respondedTime, requestErr)) + continue + } + d.out.Debug("DCluAC035", fmt.Sprintf("%s: Completed HTTP request to %s successfully in %v on try #%d", now(), url, respondedTime, try)) + return nil + } + return requestErr +} diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/result.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/result.go new file mode 100644 index 000000000000..f0b7b42c0a7c --- /dev/null +++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/result.go @@ -0,0 +1,50 @@ +package app_create + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" +) + +func (d *AppCreate) logResult() { + // start with some Info logs to the user + if d.result.App.Required { + d.out.Info("DCluAC48", fmt.Sprintf("App creation and readiness completed with success=%t in %v", d.result.App.Success, d.result.App.TotalDuration)) + } + if d.result.Service.Required { + d.out.Info("DCluAC49", fmt.Sprintf("Service creation and testing completed with success=%t in %v", d.result.Service.Success, d.result.Service.TotalDuration)) + } + if d.result.Route.Required { + d.out.Info("DCluAC50", fmt.Sprintf("Route creation and testing completed with success=%t in %v", d.result.Route.Success, d.result.Route.TotalDuration)) + } + d.out.Info("DCluAC51", fmt.Sprintf("Entire create/test completed with success=%t in %v", d.result.Success, d.result.TotalDuration)) + + // check whether results are supposed to be written to disk at all + if d.writeResultDir != "" { + // create the write directory if needed + if err := os.MkdirAll(d.writeResultDir, os.ModePerm); err != nil { + d.out.Warn("DCluAC036", err, fmt.Sprintf("Could not create debug output directory '%s': %v", 
d.writeResultDir, err)) + return + } + } else { + d.out.Debug("DCluAC037", "No output directory specified; results will not be written to files.") + return + } + + // write the result struct itself + filename := filepath.Join(d.writeResultDir, "result.json") + file, err := os.Create(filename) + if err != nil { + d.out.Warn("DCluAC038", err, fmt.Sprintf("Could not create result output file '%s': %v", filename, err)) + return + } + defer file.Close() + encoder := json.NewEncoder(file) + encoder.SetIndent("", " ") + err = encoder.Encode(d.result) + if err != nil { + d.out.Warn("DCluAC039", err, fmt.Sprintf("Could not write results to output file '%s': %v", filename, err)) + return + } +} diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/route.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/route.go new file mode 100644 index 000000000000..3b75556d0c15 --- /dev/null +++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/route.go @@ -0,0 +1,159 @@ +package app_create + +import ( + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/watch" + + route "github.com/openshift/origin/pkg/route/apis/route" +) + +func (d *AppCreate) createAndCheckRoute() { + result := &d.result.Route + result.BeginTime = jsonTime(time.Now()) + defer recordTrial(result) + if !d.createRoute() { + return + } + if d.skipRouteTest { + d.out.Debug("DCluAC021", "skipping route test as requested") + result.Success = true + return + } + result.Success = d.checkRoute() +} + +// create the route for the service +func (d *AppCreate) createRoute() bool { + defer recordTime(&d.result.Route.CreatedTime) + appRoute := &route.Route{ + ObjectMeta: metav1.ObjectMeta{Name: d.appName, Labels: d.label}, + Spec: route.RouteSpec{ + Host: d.routeHost, + To: route.RouteTargetReference{ + Kind: "Service", + Name: d.appName, + Weight: nil, + }, + }, + } + if _, err := d.RouteClient.Route().Routes(d.project).Create(appRoute); err != nil { + d.out.Error("DCluAC022", err, fmt.Sprintf("%s: Creating route '%s' failed:\n%v", now(), d.appName, err)) + return false + } + return true +} + +// check that the route is admitted and can be reached +func (d *AppCreate) checkRoute() bool { + + if !d.checkRouteAdmitted() { + return false + } + + if d.skipRouteConnect { + d.out.Debug("DCluAC023", "skipping route connection test as requested") + return true + } + return d.checkRouteConnection() +} + +func (d *AppCreate) checkRouteAdmitted() bool { + defer recordTime(&d.result.Route.ReadyTime) + + // set up a watch for route admission + d.out.Debug("DCluAC024", fmt.Sprintf("%s: Waiting for route to be admitted by a router", now())) + watcher, err := d.RouteClient.Route().Routes(d.project).Watch(metav1.ListOptions{FieldSelector: "metadata.name=" + d.appName, TimeoutSeconds: &d.routeAdmissionTimeout}) + if err != nil { + d.out.Error("DCluAC025", err, fmt.Sprintf(` +%s: Failed to establish a watch for '%s' route to be admitted by a router: + %v +This may be a transient error. Check the master API logs for anomalies near this time. + `, now(), d.appName, err)) + return false + } + defer stopWatcher(watcher) + + // test for the result of the watch + for event := range watcher.ResultChan() { + ready, err := isRouteAdmitted(event) + if err != nil { + d.out.Error("DCluAC026", err, fmt.Sprintf(` +%s: Error while watching for route to be admitted by a router: + %v +This may be a transient error. 
Check the master API logs for anomalies near this time. + `, now(), err)) + return false + } + if ready { + d.out.Debug("DCluAC027", fmt.Sprintf("%s: Route has been admitted by a router", now())) + return true + } + } + d.out.Error("DCluAC028", nil, fmt.Sprintf(` +%s: Route was not admitted by a router before timeout (%d sec) +Diagnostics waited for the '%s' route to be admitted by a router (making the +application available via that route) after the test app started running. +However, this did not occur within the timeout. +Some of the reasons why this may fail include: + * There is no router running to accept routes + * The available router(s) are configured not to accept the route + * The router simply needs longer to admit the route (you can increase the timeout) + * The app stopped responding or was killed + `, now(), d.routeAdmissionTimeout, d.appName)) + return false +} + +func isRouteAdmitted(event watch.Event) (bool, error) { + switch event.Type { + case watch.Deleted: + return false, errors.NewNotFound(schema.GroupResource{Resource: "routes"}, "") + } + switch r := event.Object.(type) { + case *route.Route: + for _, ingress := range r.Status.Ingress { + for _, cond := range ingress.Conditions { + if cond.Type == route.RouteAdmitted && cond.Status == "True" { + return true, nil + } + } + } + return false, nil + } + return false, nil +} + +// check that we get the expected HTTP response from the route +func (d *AppCreate) checkRouteConnection() bool { + defer recordTime(&d.result.Route.TestTime) + r, err := d.RouteClient.Route().Routes(d.project).Get(d.appName, metav1.GetOptions{}) + if err != nil { + d.out.Error("DCluAC029", err, fmt.Sprintf("%s: Error retrieving %s route: %v", now(), d.appName, err)) + return false + } + url := fmt.Sprintf("http://%s:%d/", r.Spec.Host, d.routePort) + if err := d.checkHttp(url, d.httpTimeout, d.httpRetries); err != nil { + d.out.Error("DCluAC030", err, fmt.Sprintf(` +%s: Request to route %s with timeout %dms failed after %d tries. + Last error was: %v +Diagnostics attempted to connect to the admitted route for the test application, +expecting to receive a successful response with HTTP code 200. This did not happen +within the given timeout. 
+Some of the reasons why this may fail include: + * The host running this diagnostic is not configured to resolve the route host via DNS + (try running from a different host, or skip the route connection test) + * The router has not yet started routing the route's host after admitting it + (try increasing the diagnostic timeout or number of retries) + * The pod stopped or was killed after starting successfully (check pod/node logs) + * The pod is responding with a non-200 HTTP code (or, not quickly enough / at all) + * Cluster networking problems prevent the router from connecting to the service + `, now(), url, d.httpTimeout, d.httpRetries+1, err)) + return false + } + d.out.Info("DCluAC031", fmt.Sprintf("%s: Request to route %s succeeded", now(), url)) + return true +} diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/service.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/service.go new file mode 100644 index 000000000000..21dc6e29fc5d --- /dev/null +++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/service.go @@ -0,0 +1,146 @@ +package app_create + +import ( + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/watch" + kapi "k8s.io/kubernetes/pkg/apis/core" +) + +const ( + serviceEndpointTimeout = 10 // seconds; failsafe to wait for endpoint to appear, which is normally instantaneous. +) + +func (d *AppCreate) createAndCheckService() bool { + result := &d.result.Service + result.BeginTime = jsonTime(time.Now()) + defer recordTrial(result) + if !d.createService() || !d.checkServiceEndpoint() { + return false + } + if d.skipServiceConnect { + d.out.Debug("DCluAC012", "skipping service connection test as requested") + result.Success = true + } else { + result.Success = d.checkServiceConnect() + } + return result.Success +} + +// create the service for the app +func (d *AppCreate) createService() bool { + defer recordTime(&d.result.Service.CreatedTime) + service := &kapi.Service{ + ObjectMeta: metav1.ObjectMeta{Name: d.appName, Labels: d.label}, + Spec: kapi.ServiceSpec{ + Type: kapi.ServiceTypeClusterIP, + Selector: d.label, + Ports: []kapi.ServicePort{ + { + Protocol: kapi.ProtocolTCP, + Port: 8080, + TargetPort: intstr.FromInt(d.appPort), + }, + }, + }, + } + if _, err := d.KubeClient.Core().Services(d.project).Create(service); err != nil { + d.out.Error("DCluAC013", err, fmt.Sprintf("%s: Creating service '%s' failed:\n%v", now(), d.appName, err)) + return false + } + return true +} + +// wait for the service to establish endpoints +func (d *AppCreate) checkServiceEndpoint() bool { + defer recordTime(&d.result.Service.ReadyTime) + + // set up a watcher for endpoints on the service + timeout := int64(serviceEndpointTimeout) + d.out.Debug("DCluAC014", fmt.Sprintf("%s: Waiting for service to establish endpoints", now())) + watcher, err := d.KubeClient.Core().Endpoints(d.project).Watch(metav1.ListOptions{FieldSelector: "metadata.name=" + d.appName, TimeoutSeconds: &timeout}) + if err != nil { + d.out.Error("DCluAC015", err, fmt.Sprintf(` +%s: Failed to establish a watch for '%s' service to be ready: + %v +This may be a transient error. Check the master API logs for anomalies near this time. 
+ `, now(), d.appName, err)) + return false + } + defer stopWatcher(watcher) + + // and wait for the results of the watch + for event := range watcher.ResultChan() { + ready, err := doesServiceHaveEndpoint(event) + if err != nil { + d.out.Error("DCluAC016", err, fmt.Sprintf(` +%s: Error while watching for service endpoint: + %v +This may be a transient error. Check the master API logs for anomalies near this time. + `, now(), err)) + return false + } + if ready { + d.out.Debug("DCluAC017", fmt.Sprintf("%s: Service has endpoint", now())) + return true + } + } + d.out.Error("DCluAC018", nil, fmt.Sprintf(` +%s: Service did not find endpoint before timeout (%d sec) +This is very unusual after the app has a running pod; it should be investigated. + `, now(), serviceEndpointTimeout)) + return false +} + +// Returns false until the service has at least one endpoint. +// Will return an error if the service is deleted or any other error occurs. +func doesServiceHaveEndpoint(event watch.Event) (bool, error) { + switch event.Type { + case watch.Deleted: + return false, errors.NewNotFound(schema.GroupResource{Resource: "services"}, "") + } + switch ep := event.Object.(type) { + case *kapi.Endpoints: + ss := ep.Subsets + if len(ss) == 0 || len(ss[0].Addresses) < 1 { + return false, nil + } + return true, nil + } + return false, nil +} + +// check we can actually get a response from the service +func (d *AppCreate) checkServiceConnect() bool { + defer recordTime(&d.result.Service.TestTime) + service, err := d.KubeClient.Core().Services(d.project).Get(d.appName, metav1.GetOptions{}) + if err != nil { + d.out.Error("DCluAC018", err, fmt.Sprintf("%s: Error retrieving %s service: %v", now(), d.appName, err)) + return false + } + + url := fmt.Sprintf("http://%s:8080/", service.Spec.ClusterIP) + if err := d.checkHttp(url, d.httpTimeout, d.httpRetries); err != nil { + d.out.Error("DCluAC019", err, fmt.Sprintf(` +%s: Request to service %s with timeout %dms failed after %d tries. + Last error was: %v +Diagnostics attempted to connect to the service address for the test application, +expecting to receive a successful response with HTTP code 200. This did not happen +within the given timeout. 
+Some of the reasons why this may fail include: + * The host running this diagnostic is not part of the cluster SDN + (try running from a master, or skip the service connection test) + * The pod stopped or was killed after starting successfully (check pod/node logs) + * The pod is responding with a non-200 HTTP code (or, not quickly enough / at all) + * Cluster networking problems prevent connecting to the service + `, now(), url, d.httpTimeout, d.httpRetries+1, err)) + return false + } + d.out.Info("DCluAC020", fmt.Sprintf("%s: Request to service address %s succeeded", now(), url)) + return true +} diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/setup_cleanup.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/setup_cleanup.go new file mode 100644 index 000000000000..b5f9c20679fb --- /dev/null +++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/app_create/setup_cleanup.go @@ -0,0 +1,123 @@ +package app_create + +import ( + "bytes" + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/openshift/origin/pkg/cmd/server/bootstrappolicy" + newproject "github.com/openshift/origin/pkg/oc/admin/project" + appscmd "github.com/openshift/origin/pkg/oc/cli/deploymentconfigs" +) + +const podGoneTimeout = 30 // seconds to wait for previous app pods to disappear + +func (d *AppCreate) prepareForApp() bool { + defer func() { + d.result.PrepDuration = jsonDuration(time.Since(time.Time(d.result.BeginTime))) + }() + if !d.setupProject() { + return false + } + + // delete any pieces of the app left over from a previous run so they don't get in the way + d.cleanupApp() + // ensure that the previous app is gone before creating again + if !d.waitForPodGone() { + return false + } + + return true +} + +func (d *AppCreate) setupProject() bool { + d.out.Info("DCluAC003", fmt.Sprintf("%s: Using project '%s' for diagnostic.", now(), d.project)) + if existing, err := d.KubeClient.Core().Namespaces().Get(d.project, metav1.GetOptions{}); existing != nil && err == nil { + d.out.Debug("DCluAC004", fmt.Sprintf("%s: Project '%s' already exists.", now(), d.project)) + return true + } + + buffer := bytes.Buffer{} + projOpts := &newproject.NewProjectOptions{ + ProjectName: d.project, + DisplayName: "AppCreate diagnostic", + Description: "AppCreate diagnostic", + NodeSelector: d.nodeSelector, + ProjectClient: d.ProjectClient, + RoleBindingClient: d.RoleBindingClient, + AdminRole: bootstrappolicy.AdminRoleName, + AdminUser: "", + Output: &buffer, + } + if err := projOpts.Run(true); err != nil { + d.out.Error("DCluAC005", err, fmt.Sprintf("%s: Creating project '%s' failed: \n%s\n%v", now(), d.project, buffer.String(), err)) + return false + } + + return true +} + +func (d *AppCreate) cleanup() { + if !d.keepApp { + d.cleanupApp() + } + if !d.keepProject { + d.out.Debug("DCluAC041", fmt.Sprintf("%s: Deleting project '%s'.", now(), d.project)) + if err := d.KubeClient.Core().Namespaces().Delete(d.project, nil); err != nil { + d.out.Warn("DCluAC042", err, fmt.Sprintf("%s: Deleting project '%s' failed: %v", now(), d.project, err)) + } else { + return + } + } +} + +// delete all the app components. Errors are listed in debug and ignored, as it is normal for these components +// not to exist and thus lead to an error on delete. If it turns out that other errors occur that we actually +// care about then this can be refined. 
+func (d *AppCreate) cleanupApp() { + errs := []error{} + d.out.Debug("DCluAC043", fmt.Sprintf("%s: Deleting components of app '%s' if present.", now(), d.appName)) + + // reap the DC's deployments first + if err := appscmd.NewDeploymentConfigReaper(d.AppsClient, d.KubeClient).Stop(d.project, d.appName, time.Duration(1)*time.Second, nil); err != nil { + errs = append(errs, err) + } + + // then delete the DC, service, and route + if err := d.AppsClient.Apps().DeploymentConfigs(d.project).Delete(d.appName, nil); err != nil { + errs = append(errs, err) + } + if err := d.KubeClient.Core().Services(d.project).Delete(d.appName, nil); err != nil { + errs = append(errs, err) + } + if err := d.RouteClient.Route().Routes(d.project).Delete(d.appName, nil); err != nil { + errs = append(errs, err) + } + + if len(errs) > 0 { + d.out.Debug("DCluAC044", fmt.Sprintf("%s: Deleting components of app '%s' failed: %v", now(), d.appName, errs)) + } +} + +func (d *AppCreate) waitForPodGone() bool { + d.out.Debug("DCluAC045", fmt.Sprintf("%s: Waiting to ensure any previous pod for '%s' is gone.", now(), d.appName)) + err := wait.PollImmediate(time.Second, time.Duration(podGoneTimeout)*time.Second, func() (bool, error) { + pods, err := d.KubeClient.Core().Pods(d.project).List(metav1.ListOptions{LabelSelector: d.labelSelector}) + if err == nil && len(pods.Items) == 0 { + return true, nil + } + return false, err + }) + switch err { + case nil: + return true + case wait.ErrWaitTimeout: + d.out.Error("DCluAC046", err, fmt.Sprintf("%s: Previous app pod still present after %ds", now(), podGoneTimeout)) + default: + d.out.Error("DCluAC047", err, fmt.Sprintf("%s: Error while checking for previous app pod:\n%v", now(), err)) + } + return false +} diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/master_node.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/master_node.go index 08605a016d3b..f6c2c36294d7 100644 --- a/pkg/oc/admin/diagnostics/diagnostics/cluster/master_node.go +++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/master_node.go @@ -15,6 +15,7 @@ import ( configapilatest "github.com/openshift/origin/pkg/cmd/server/apis/config/latest" "github.com/openshift/origin/pkg/network" "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types" + "github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util" ) const masterNotRunningAsANode = `Unable to find a node matching the cluster server IP. 
@@ -68,7 +69,7 @@ func (d *MasterNode) CanRun() (bool, error) {
 		}
 	}
 
-	can, err := userCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
+	can, err := util.UserCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
 		Verb:     "list",
 		Group:    kapi.GroupName,
 		Resource: "nodes",
diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/node_definitions.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/node_definitions.go
index fd224ea6c001..a49de161a14a 100644
--- a/pkg/oc/admin/diagnostics/diagnostics/cluster/node_definitions.go
+++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/node_definitions.go
@@ -14,6 +14,7 @@ import (
 
 	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/log"
 	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types"
+	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util"
 )
 
 const (
@@ -68,7 +69,7 @@ func (d *NodeDefinitions) CanRun() (bool, error) {
 	if d.KubeClient == nil {
 		return false, errors.New("must have kube client")
 	}
-	can, err := userCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
+	can, err := util.UserCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
 		Verb:     "list",
 		Group:    kapi.GroupName,
 		Resource: "nodes",
diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/registry.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/registry.go
index dd280f5a4db4..b6a8c73d626d 100644
--- a/pkg/oc/admin/diagnostics/diagnostics/cluster/registry.go
+++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/registry.go
@@ -17,6 +17,7 @@ import (
 	osapi "github.com/openshift/origin/pkg/image/apis/image"
 	imagetypedclient "github.com/openshift/origin/pkg/image/generated/internalclientset/typed/image/internalversion"
 	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types"
+	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util"
 )
 
 // ClusterRegistry is a Diagnostic to check that there is a working Docker registry.
@@ -162,7 +163,7 @@ func (d *ClusterRegistry) CanRun() (bool, error) {
 	if d.ImageStreamClient == nil || d.KubeClient == nil {
 		return false, fmt.Errorf("must have kube and os clients")
 	}
-	return userCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
+	return util.UserCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
 		Namespace: metav1.NamespaceDefault,
 		Verb:      "get",
 		Group:     kapi.GroupName,
diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/rolebindings.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/rolebindings.go
index f5723d497701..7d1bb515731f 100644
--- a/pkg/oc/admin/diagnostics/diagnostics/cluster/rolebindings.go
+++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/rolebindings.go
@@ -11,8 +11,9 @@ import (
 
 	authorizationapi "github.com/openshift/origin/pkg/authorization/apis/authorization"
 	oauthorizationtypedclient "github.com/openshift/origin/pkg/authorization/generated/internalclientset/typed/authorization/internalversion"
-	"github.com/openshift/origin/pkg/authorization/registry/util"
+	regutil "github.com/openshift/origin/pkg/authorization/registry/util"
 	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types"
+	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util"
 	policycmd "github.com/openshift/origin/pkg/oc/admin/policy"
 )
 
@@ -46,7 +47,7 @@ func (d *ClusterRoleBindings) CanRun() (bool, error) {
 		return false, fmt.Errorf("must have client.SubjectAccessReviews")
 	}
 
-	return userCan(d.SARClient, &authorization.ResourceAttributes{
+	return util.UserCan(d.SARClient, &authorization.ResourceAttributes{
 		Verb:     "list",
 		Group:    authorizationapi.GroupName,
 		Resource: "clusterrolebindings",
@@ -88,7 +89,7 @@ func (d *ClusterRoleBindings) Check() types.DiagnosticResult {
 			r.Error("CRBD1002", err, fmt.Sprintf("Unable to get clusterrolebinding/%s: %v", changedClusterRoleBinding.Name, err))
 			continue
 		}
-		actualRBACClusterRole, err := util.ClusterRoleBindingToRBAC(actualClusterRole)
+		actualRBACClusterRole, err := regutil.ClusterRoleBindingToRBAC(actualClusterRole)
 		if err != nil {
 			r.Error("CRBD1008", err, fmt.Sprintf("Unable to convert clusterrolebinding/%s to RBAC: %v", actualClusterRole.Name, err))
 			continue
diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/roles.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/roles.go
index 30b0880b2531..5415e664525a 100644
--- a/pkg/oc/admin/diagnostics/diagnostics/cluster/roles.go
+++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/roles.go
@@ -11,8 +11,9 @@ import (
 
 	authorizationapi "github.com/openshift/origin/pkg/authorization/apis/authorization"
 	oauthorizationtypedclient "github.com/openshift/origin/pkg/authorization/generated/internalclientset/typed/authorization/internalversion"
-	"github.com/openshift/origin/pkg/authorization/registry/util"
+	regutil "github.com/openshift/origin/pkg/authorization/registry/util"
 	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types"
+	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util"
 	policycmd "github.com/openshift/origin/pkg/oc/admin/policy"
 	rbacregistryvalidation "k8s.io/kubernetes/pkg/registry/rbac/validation"
 )
@@ -74,7 +75,7 @@ func (d *ClusterRoles) CanRun() (bool, error) {
 		return false, fmt.Errorf("must have client.SubjectAccessReviews")
 	}
 
-	return userCan(d.SARClient, &authorization.ResourceAttributes{
+	return util.UserCan(d.SARClient, &authorization.ResourceAttributes{
 		Verb:     "list",
 		Group:    authorizationapi.GroupName,
 		Resource: "clusterroles",
@@ -113,7 +114,7 @@ func (d *ClusterRoles) Check() types.DiagnosticResult {
 			continue
 		}
 
-		actualRBACClusterRole, err := util.ClusterRoleToRBAC(actualClusterRole)
+		actualRBACClusterRole, err := regutil.ClusterRoleToRBAC(actualClusterRole)
 		if err != nil {
 			r.Error("CRD1009", err, fmt.Sprintf("Unable to convert clusterrole/%s to RBAC cluster role: %v", actualClusterRole.Name, err))
 			continue
diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/route_validation.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/route_validation.go
index 1af00befde63..66c7a49a919f 100644
--- a/pkg/oc/admin/diagnostics/diagnostics/cluster/route_validation.go
+++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/route_validation.go
@@ -13,6 +13,7 @@ import (
 	authorizationtypedclient "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/authorization/internalversion"
 
 	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types"
+	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util"
 	routeapi "github.com/openshift/origin/pkg/route/apis/route"
 	"github.com/openshift/origin/pkg/route/apis/route/validation"
 	clientset "github.com/openshift/origin/pkg/route/generated/internalclientset"
@@ -52,7 +53,7 @@ func (d *RouteCertificateValidation) CanRun() (bool, error) {
 	if d.RESTConfig == nil || d.SARClient == nil {
 		return false, errors.New("must have Kube client configuration")
 	}
-	can, err := userCan(d.SARClient, &authorization.ResourceAttributes{
+	can, err := util.UserCan(d.SARClient, &authorization.ResourceAttributes{
 		Namespace: metav1.NamespaceAll,
 		Verb:      "get",
 		Group:     routeapi.GroupName,
diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/router.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/router.go
index 731c85808c06..c5ca8d83ec51 100644
--- a/pkg/oc/admin/diagnostics/diagnostics/cluster/router.go
+++ b/pkg/oc/admin/diagnostics/diagnostics/cluster/router.go
@@ -18,6 +18,7 @@ import (
 	appsapi "github.com/openshift/origin/pkg/apps/apis/apps"
 	appstypedclient "github.com/openshift/origin/pkg/apps/generated/internalclientset/typed/apps/internalversion"
 	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/types"
+	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util"
 
 	"k8s.io/kubernetes/pkg/apis/authorization"
 )
@@ -99,7 +100,7 @@ func (d *ClusterRouter) CanRun() (bool, error) {
 	if d.KubeClient == nil || d.DCClient == nil {
 		return false, errors.New("must have kube and os client")
 	}
-	can, err := userCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
+	can, err := util.UserCan(d.KubeClient.Authorization(), &authorization.ResourceAttributes{
 		Namespace: metav1.NamespaceDefault,
 		Verb:      "get",
 		Group:     appsapi.GroupName,
diff --git a/pkg/oc/admin/diagnostics/diagnostics/cluster/util.go b/pkg/oc/admin/diagnostics/diagnostics/cluster/util.go
deleted file mode 100644
index 6d1f9237992f..000000000000
--- a/pkg/oc/admin/diagnostics/diagnostics/cluster/util.go
+++ /dev/null
@@ -1,23 +0,0 @@
-package cluster
-
-import (
-	"k8s.io/kubernetes/pkg/apis/authorization"
-	authorizationtypedclient "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/authorization/internalversion"
-)
-
-func userCan(sarClient authorizationtypedclient.SelfSubjectAccessReviewsGetter, action *authorization.ResourceAttributes) (bool, error) {
-	resp, err := sarClient.SelfSubjectAccessReviews().Create(&authorization.SelfSubjectAccessReview{
-		Spec: authorization.SelfSubjectAccessReviewSpec{
-			ResourceAttributes: action,
-		},
-	})
-	if err != nil {
-		return false, err
-	}
-
-	if resp.Status.Allowed {
-		return true, nil
-	}
-
-	return false, nil
-}
diff --git a/pkg/oc/admin/diagnostics/diagnostics/util/util.go b/pkg/oc/admin/diagnostics/diagnostics/util/util.go
index e78bc91594bc..aed654dccc02 100644
--- a/pkg/oc/admin/diagnostics/diagnostics/util/util.go
+++ b/pkg/oc/admin/diagnostics/diagnostics/util/util.go
@@ -1,5 +1,10 @@
 package util
 
+import (
+	"k8s.io/kubernetes/pkg/apis/authorization"
+	authorizationtypedclient "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/authorization/internalversion"
+)
+
 var (
 	AdminKubeConfigPaths = []string{
 		"/etc/openshift/master/admin.kubeconfig", // enterprise
@@ -7,3 +12,20 @@ var (
 		"./openshift.local.config/master/admin.kubeconfig", // origin binary
 	}
 )
+
+func UserCan(sarClient authorizationtypedclient.SelfSubjectAccessReviewsGetter, action *authorization.ResourceAttributes) (bool, error) {
+	resp, err := sarClient.SelfSubjectAccessReviews().Create(&authorization.SelfSubjectAccessReview{
+		Spec: authorization.SelfSubjectAccessReviewSpec{
+			ResourceAttributes: action,
+		},
+	})
+	if err != nil {
+		return false, err
+	}
+
+	if resp.Status.Allowed {
+		return true, nil
+	}
+
+	return false, nil
+}
diff --git a/pkg/oc/admin/project/new_project.go b/pkg/oc/admin/project/new_project.go
index cfbc4a36e549..f146f311ce4a 100644
--- a/pkg/oc/admin/project/new_project.go
+++ b/pkg/oc/admin/project/new_project.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"os"
 	"time"
 
 	"github.com/spf13/cobra"
@@ -40,6 +41,8 @@ type NewProjectOptions struct {
 
 	AdminRole string
 	AdminUser string
+
+	Output io.Writer
 }
 
 var newProjectLong = templates.LongDesc(`
@@ -126,7 +129,12 @@ func (o *NewProjectOptions) Run(useNodeSelector bool) error {
 		return err
 	}
 
-	fmt.Printf("Created project %v\n", o.ProjectName)
+	output := o.Output
+	if output == nil {
+		output = os.Stdout
+	}
+
+	fmt.Fprintf(output, "Created project %v\n", o.ProjectName)
 
 	errs := []error{}
 	if len(o.AdminUser) != 0 {
@@ -137,7 +145,7 @@ func (o *NewProjectOptions) Run(useNodeSelector bool) error {
 		}
 
 		if err := adduser.AddRole(); err != nil {
-			fmt.Printf("%v could not be added to the %v role: %v\n", o.AdminUser, o.AdminRole, err)
+			fmt.Fprintf(output, "%v could not be added to the %v role: %v\n", o.AdminUser, o.AdminRole, err)
 			errs = append(errs, err)
 		} else {
 			if err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
@@ -166,7 +174,7 @@ func (o *NewProjectOptions) Run(useNodeSelector bool) error {
 	for _, rbacBinding := range bootstrappolicy.GetBootstrapServiceAccountProjectRoleBindings(o.ProjectName) {
 		binding, err := authorizationregistryutil.RoleBindingFromRBAC(&rbacBinding)
 		if err != nil {
-			fmt.Printf("Could not convert Role Binding %s in the %q namespace: %v\n", rbacBinding.Name, o.ProjectName, err)
+			fmt.Fprintf(output, "Could not convert Role Binding %s in the %q namespace: %v\n", rbacBinding.Name, o.ProjectName, err)
 			errs = append(errs, err)
 			continue
 		}
@@ -177,7 +185,7 @@ func (o *NewProjectOptions) Run(useNodeSelector bool) error {
 			Subjects: binding.Subjects,
 		}
 		if err := addRole.AddRole(); err != nil {
-			fmt.Printf("Could not add service accounts to the %v role: %v\n", binding.RoleRef.Name, err)
+			fmt.Fprintf(output, "Could not add service accounts to the %v role: %v\n", binding.RoleRef.Name, err)
 			errs = append(errs, err)
 		}
 	}
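
With the self-subject access review helper promoted to the shared diagnostics util package as UserCan, any diagnostic can gate its CanRun check on the caller's permissions without re-implementing the review. A minimal sketch of a consumer follows; the ExampleDiagnostic type and its field are hypothetical and only illustrate the call pattern, they are not part of this patch.

package example

import (
	"errors"

	"k8s.io/kubernetes/pkg/apis/authorization"
	authorizationtypedclient "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/authorization/internalversion"

	"github.com/openshift/origin/pkg/oc/admin/diagnostics/diagnostics/util"
)

// ExampleDiagnostic is a hypothetical diagnostic used only to show how a
// CanRun gate can reuse util.UserCan after this refactor.
type ExampleDiagnostic struct {
	SARClient authorizationtypedclient.SelfSubjectAccessReviewsGetter
}

// CanRun reports whether the current user may list nodes, mirroring the
// pattern used by the cluster diagnostics touched in this patch.
func (d *ExampleDiagnostic) CanRun() (bool, error) {
	if d.SARClient == nil {
		return false, errors.New("must have client.SubjectAccessReviews")
	}
	return util.UserCan(d.SARClient, &authorization.ResourceAttributes{
		Verb:     "list",
		Resource: "nodes",
	})
}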
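
The Output writer added to NewProjectOptions lets a caller capture the status messages that Run previously printed straight to stdout, while existing callers are unchanged because a nil Output still falls back to os.Stdout. A rough sketch of a caller capturing that output into a buffer; the import alias, helper name, and field values are illustrative, and the client and role fields that Run also needs are omitted for brevity.

package example

import (
	"bytes"

	newproject "github.com/openshift/origin/pkg/oc/admin/project"
)

// createProjectCaptured runs project creation with its messages collected in
// a buffer instead of written to stdout. Only the fields relevant to the new
// Output behavior are shown; the remaining options must be populated exactly
// as existing callers do today.
func createProjectCaptured() (string, error) {
	out := &bytes.Buffer{}
	opts := &newproject.NewProjectOptions{
		ProjectName: "example-project", // illustrative value
		Output:      out,               // capture "Created project ..." and error messages
	}
	err := opts.Run(false) // useNodeSelector=false, per the existing signature
	return out.String(), err
}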