-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New gather subcommand to assist debugging bootstrap failures. #1627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,234 @@ | ||
| package main | ||
|
|
||
| import ( | ||
| "encoding/json" | ||
| "fmt" | ||
| "io/ioutil" | ||
| "os" | ||
| "path/filepath" | ||
| "strings" | ||
|
|
||
| "github.com/pkg/errors" | ||
| "github.com/sirupsen/logrus" | ||
| "github.com/spf13/cobra" | ||
| "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" | ||
|
|
||
| "github.com/openshift/installer/pkg/asset/installconfig" | ||
| assetstore "github.com/openshift/installer/pkg/asset/store" | ||
| "github.com/openshift/installer/pkg/terraform" | ||
| "github.com/openshift/installer/pkg/types" | ||
| awstypes "github.com/openshift/installer/pkg/types/aws" | ||
| libvirttypes "github.com/openshift/installer/pkg/types/libvirt" | ||
| openstacktypes "github.com/openshift/installer/pkg/types/openstack" | ||
| ) | ||
|
|
||
| func newGatherCmd() *cobra.Command { | ||
| cmd := &cobra.Command{ | ||
| Use: "gather", | ||
| Short: "Gather debugging data for a given installation failure", | ||
| Long: `Gather debugging data for a given installation failure. | ||
|
|
||
| When installation for Openshift cluster fails, gathering all the data useful for debugging can | ||
| become a difficult task. This command helps users to collect the most relevant information that can be used | ||
| to debug the installation failures`, | ||
| RunE: func(cmd *cobra.Command, args []string) error { | ||
| return cmd.Help() | ||
| }, | ||
| } | ||
| cmd.AddCommand(newGatherBootstrapCmd()) | ||
| return cmd | ||
| } | ||
|
|
||
// gatherBootstrapOpts holds the flag values accepted by the
// "gather bootstrap" subcommand.
var gatherBootstrapOpts struct {
	// bootstrap is bound to the --bootstrap flag.
	bootstrap string
	// masters is bound to the repeatable --master flag.
	masters []string
}
|
|
||
| func newGatherBootstrapCmd() *cobra.Command { | ||
| cmd := &cobra.Command{ | ||
| Use: "bootstrap", | ||
| Short: "Gather debugging data for a failing to bootstrap control plane", | ||
| Args: cobra.ExactArgs(0), | ||
| Run: func(_ *cobra.Command, _ []string) { | ||
| cleanup := setupFileHook(rootOpts.dir) | ||
| defer cleanup() | ||
| err := runGatherBootstrapCmd(rootOpts.dir) | ||
| if err != nil { | ||
| logrus.Fatal(err) | ||
| } | ||
| }, | ||
| } | ||
| cmd.PersistentFlags().StringVar(&gatherBootstrapOpts.bootstrap, "bootstrap", "", "Hostname or IP of the bootstrap host") | ||
| cmd.PersistentFlags().StringArrayVar(&gatherBootstrapOpts.masters, "master", []string{}, "Hostnames or IPs of all control plane hosts") | ||
| return cmd | ||
| } | ||
|
|
||
| func runGatherBootstrapCmd(directory string) error { | ||
| tfStateFilePath := filepath.Join(directory, terraform.StateFileName) | ||
| _, err := os.Stat(tfStateFilePath) | ||
| if os.IsNotExist(err) { | ||
| return unSupportedPlatformGather() | ||
| } | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| assetStore, err := assetstore.NewStore(directory) | ||
| if err != nil { | ||
| return errors.Wrap(err, "failed to create asset store") | ||
| } | ||
|
|
||
| config := &installconfig.InstallConfig{} | ||
| if err := assetStore.Fetch(config); err != nil { | ||
| return errors.Wrapf(err, "failed to fetch %s", config.Name()) | ||
| } | ||
|
|
||
| sfRaw, err := ioutil.ReadFile(tfStateFilePath) | ||
| if err != nil { | ||
| return errors.Wrapf(err, "failed to read %q", tfStateFilePath) | ||
| } | ||
|
|
||
| var tfstate terraformState | ||
| if err := json.Unmarshal(sfRaw, &tfstate); err != nil { | ||
| return errors.Wrapf(err, "failed to unmarshal %q", tfStateFilePath) | ||
| } | ||
|
|
||
| bootstrap, masters, err := extractHostAddresses(config.Config, tfstate) | ||
| if err != nil { | ||
| if err2, ok := err.(errUnSupportedGatherPlatform); ok { | ||
| logrus.Error(err2) | ||
| return unSupportedPlatformGather() | ||
| } | ||
| return errors.Wrapf(err, "failed to get bootstrap and control plane host addresses from %q", tfStateFilePath) | ||
| } | ||
|
|
||
| logGatherBootstrap(bootstrap, masters) | ||
| return nil | ||
| } | ||
|
|
||
| func logGatherBootstrap(bootstrap string, masters []string) { | ||
| if s, ok := os.LookupEnv("SSH_AUTH_SOCK"); !ok || s == "" { | ||
| logrus.Info("Make sure ssh-agent is running, env SSH_AUTH_SOCK is set to the ssh-agent's UNIX socket and your private key is added to the agent.") | ||
| } | ||
| logrus.Info("Use the following commands to gather logs from the cluster") | ||
| logrus.Infof("ssh -A core@%s '/usr/local/bin/installer-gather.sh %s'", bootstrap, strings.Join(masters, " ")) | ||
| logrus.Infof("scp core@%s:~/log-bundle.tar.gz .", bootstrap) | ||
| } | ||
|
|
||
| func extractHostAddresses(config *types.InstallConfig, tfstate terraformState) (bootstrap string, masters []string, err error) { | ||
| mcount := *config.ControlPlane.Replicas | ||
| switch config.Platform.Name() { | ||
| case awstypes.Name: | ||
| bm := tfstate.Modules["root/bootstrap"] | ||
| bootstrap, _, err = unstructured.NestedString(bm.Resources["aws_instance.bootstrap"], "primary", "attributes", "public_ip") | ||
| if err != nil { | ||
| return bootstrap, masters, errors.Wrapf(err, "failed to get bootstrap host addresses") | ||
| } | ||
|
|
||
| mm := tfstate.Modules["root/masters"] | ||
| for idx := int64(0); idx < mcount; idx++ { | ||
| r := fmt.Sprintf("aws_instance.master.%d", idx) | ||
| if mcount == 1 { | ||
| r = "aws_instance.master" | ||
| } | ||
| var master string | ||
| master, _, err = unstructured.NestedString(mm.Resources[r], "primary", "attributes", "private_ip") | ||
| if err != nil { | ||
| return bootstrap, masters, errors.Wrapf(err, "failed to get master host addresses") | ||
| } | ||
| masters = append(masters, master) | ||
| } | ||
| case libvirttypes.Name: | ||
| bm := tfstate.Modules["root/bootstrap"] | ||
| bootstrap, _, err = unstructured.NestedString(bm.Resources["libvirt_domain.bootstrap"], "primary", "attributes", "network_interface.0.hostname") | ||
| if err != nil { | ||
| return bootstrap, masters, errors.Wrapf(err, "failed to get bootstrap host addresses") | ||
| } | ||
|
|
||
| rm := tfstate.Modules["root"] | ||
| for idx := int64(0); idx < mcount; idx++ { | ||
| r := fmt.Sprintf("libvirt_domain.master.%d", idx) | ||
| if mcount == 1 { | ||
| r = "libvirt_domain.master" | ||
| } | ||
| var master string | ||
| master, _, err = unstructured.NestedString(rm.Resources[r], "primary", "attributes", "network_interface.0.hostname") | ||
| if err != nil { | ||
| return bootstrap, masters, errors.Wrapf(err, "failed to get master host addresses") | ||
| } | ||
| masters = append(masters, master) | ||
| } | ||
| case openstacktypes.Name: | ||
| bm := tfstate.Modules["root/bootstrap"] | ||
| bootstrap, _, err = unstructured.NestedString(bm.Resources["openstack_compute_instance_v2.bootstrap"], "primary", "attributes", "access_ip_v4") | ||
| if err != nil { | ||
| return bootstrap, masters, errors.Wrapf(err, "failed to get bootstrap host addresses") | ||
| } | ||
|
|
||
| mm := tfstate.Modules["root/masters"] | ||
| for idx := int64(0); idx < mcount; idx++ { | ||
| r := fmt.Sprintf("openstack_compute_instance_v2.master_conf.%d", idx) | ||
| if mcount == 1 { | ||
| r = "openstack_compute_instance_v2.master_conf" | ||
| } | ||
| var master string | ||
| master, _, err = unstructured.NestedString(mm.Resources[r], "primary", "attributes", "access_ip_v4") | ||
| if err != nil { | ||
| return bootstrap, masters, errors.Wrapf(err, "failed to get master host addresses") | ||
| } | ||
| masters = append(masters, master) | ||
| } | ||
| default: | ||
| return "", nil, errUnSupportedGatherPlatform{Message: fmt.Sprintf("Cannot fetch the bootstrap and control plane host addresses from state file for %s platform", config.Platform.Name())} | ||
| } | ||
| return bootstrap, masters, nil | ||
| } | ||
|
|
||
// terraformState is the subset of a Terraform state file used to look up host
// addresses. Modules is keyed by the module's path elements joined with "/"
// (for example "root/bootstrap").
type terraformState struct {
	Modules map[string]terraformStateModule
}

// terraformStateModule holds the resources of one module, keyed by resource
// name; each resource is kept as loosely-typed decoded JSON.
type terraformStateModule struct {
	Resources map[string]map[string]interface{} `json:"resources"`
}

// UnmarshalJSON decodes a state document whose "modules" field is a list of
// objects with a "path" list and a "resources" object, and indexes the modules
// by their "/"-joined path. Entries already present in tfs.Modules are kept
// unless a decoded module has the same path.
//
// It returns an error for malformed JSON and for a nil receiver: results
// cannot be stored through a nil pointer, so reporting success there would
// silently discard the decoded state.
func (tfs *terraformState) UnmarshalJSON(raw []byte) error {
	if tfs == nil {
		return errors.New("cannot unmarshal terraform state into a nil *terraformState")
	}

	var transform struct {
		Modules []struct {
			Path []string `json:"path"`
			terraformStateModule
		} `json:"modules"`
	}
	if err := json.Unmarshal(raw, &transform); err != nil {
		return err
	}

	if tfs.Modules == nil {
		tfs.Modules = make(map[string]terraformStateModule, len(transform.Modules))
	}
	for _, m := range transform.Modules {
		tfs.Modules[strings.Join(m.Path, "/")] = terraformStateModule{Resources: m.Resources}
	}
	return nil
}
|
|
||
// errUnSupportedGatherPlatform is the error returned when host addresses
// cannot be derived from the Terraform state for the cluster's platform.
type errUnSupportedGatherPlatform struct {
	// Message is the full, human-readable error text.
	Message string
}

// Compile-time check that the type satisfies the error interface.
var _ error = errUnSupportedGatherPlatform{}

// Error returns Message unchanged.
func (unsupported errUnSupportedGatherPlatform) Error() string {
	return unsupported.Message
}
|
|
||
| func unSupportedPlatformGather() error { | ||
| if gatherBootstrapOpts.bootstrap == "" || len(gatherBootstrapOpts.masters) == 0 { | ||
| return errors.New("boostrap host address and at least one control plane host address must be provided") | ||
| } | ||
|
|
||
| logGatherBootstrap(gatherBootstrapOpts.bootstrap, gatherBootstrapOpts.masters) | ||
| return nil | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,6 @@ | ||
| #!/usr/bin/env bash | ||
| set -eo pipefail | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This completely removes error checking, no? What happens if, for example,
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should try to make sure this script never fails and gathers as much as it can.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, the bootstrap node is a very controlled environment, so things like mkdir failing will probably be caught in our CI ;)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If we look and notice that things are missing. I'm unlikely to be that observant unless quite a lot is missing ;). |
||
|
|
||
| ARTIFACTS="${1:-/tmp/artifacts}" | ||
| ARTIFACTS="/tmp/artifacts" | ||
|
|
||
| echo "Gathering bootstrap journals ..." | ||
| mkdir -p "${ARTIFACTS}/bootstrap/journals" | ||
|
|
@@ -15,8 +14,8 @@ mkdir -p "${ARTIFACTS}/bootstrap/containers" | |
| sudo crictl ps --all --quiet | while read -r container | ||
| do | ||
| container_name="$(sudo crictl ps -a --id "${container}" -v | grep -oP "Name: \\K(.*)")" | ||
| sudo crictl logs "${container}" >& "${ARTIFACTS}/bootstrap/containers/${container_name}.log" || true | ||
| sudo crictl inspect "${container}" >& "${ARTIFACTS}/bootstrap/containers/${container_name}.inspect" || true | ||
| sudo crictl logs "${container}" >& "${ARTIFACTS}/bootstrap/containers/${container_name}.log" | ||
| sudo crictl inspect "${container}" >& "${ARTIFACTS}/bootstrap/containers/${container_name}.inspect" | ||
| done | ||
| mkdir -p "${ARTIFACTS}/bootstrap/pods" | ||
| sudo podman ps --all --quiet | while read -r container | ||
|
|
@@ -81,8 +80,9 @@ wait | |
|
|
||
| echo "Gather remote logs" | ||
| export MASTERS=() | ||
| if [ "$(stat --printf="%s" "${ARTIFACTS}/resources/masters.list")" -ne "0" ] | ||
| then | ||
| if [ "$#" -ne 0 ]; then | ||
| MASTERS=( "$@" ) | ||
| elif [ "$(stat --printf="%s" "${ARTIFACTS}/resources/masters.list")" -ne "0" ]; then | ||
| # shellcheck disable=SC2030 | ||
| mapfile -t MASTERS < "${ARTIFACTS}/resources/masters.list" | ||
| else | ||
|
|
@@ -95,10 +95,10 @@ fi | |
| for master in "${MASTERS[@]}" | ||
| do | ||
| echo "Collecting info from ${master}" | ||
| scp -o PreferredAuthentications=publickey -o StrictHostKeyChecking=false -o UserKnownHostsFile=/dev/null /usr/local/bin/installer-masters-gather.sh "core@${master}:" || true | ||
| scp -o PreferredAuthentications=publickey -o StrictHostKeyChecking=false -o UserKnownHostsFile=/dev/null /usr/local/bin/installer-masters-gather.sh "core@${master}:" | ||
| mkdir -p "${ARTIFACTS}/control-plane/${master}" | ||
| ssh -o PreferredAuthentications=publickey -o StrictHostKeyChecking=false -o UserKnownHostsFile=/dev/null "core@${master}" -C 'sudo ./installer-masters-gather.sh' </dev/null || true | ||
| ssh -o PreferredAuthentications=publickey -o StrictHostKeyChecking=false -o UserKnownHostsFile=/dev/null "core@${master}" -C 'sudo tar c -C /tmp/artifacts/ .' </dev/null | tar -x -C "${ARTIFACTS}/control-plane/${master}/" || true | ||
| ssh -o PreferredAuthentications=publickey -o StrictHostKeyChecking=false -o UserKnownHostsFile=/dev/null "core@${master}" -C 'sudo ./installer-masters-gather.sh' </dev/null | ||
| ssh -o PreferredAuthentications=publickey -o StrictHostKeyChecking=false -o UserKnownHostsFile=/dev/null "core@${master}" -C 'sudo tar c -C /tmp/artifacts/ .' </dev/null | tar -x -C "${ARTIFACTS}/control-plane/${master}/" | ||
| done | ||
| tar cz -C /tmp/artifacts . > ~/log-bundle.tar.gz | ||
| echo "Log bundle written to ~/log-bundle.tar.gz" | ||
Uh oh!
There was an error while loading. Please reload this page.