From e1295950d6722cf8f070ab6bcb92889a3be941fb Mon Sep 17 00:00:00 2001 From: Alex McGrath Date: Thu, 27 Apr 2023 13:24:56 +0100 Subject: [PATCH 1/5] Log troubleshooting information when InvalidInstanceID errors are found --- docs/pages/server-access/guides/ec2-discovery.mdx | 9 +++++++++ lib/srv/discovery/discovery.go | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/pages/server-access/guides/ec2-discovery.mdx b/docs/pages/server-access/guides/ec2-discovery.mdx index 3783beebd4e26..65c3154c0f7d5 100644 --- a/docs/pages/server-access/guides/ec2-discovery.mdx +++ b/docs/pages/server-access/guides/ec2-discovery.mdx @@ -393,6 +393,15 @@ error json: cannot unmarshal object into Go struct field DownloadContentPlugin.s It is likely that you're running an older SSM agent version. Upgrade to SSM agent version 3.1 or greater to resolve. +### `InvalidInstanceId: Instances [[i-123]] not in a valid state for account 456` + +The following problems can cause this exception: +- The discovery node don't have permission to access the managed node. +- AWS Systems Manager Agent(SSM Agent) isn't running. Verify that SSM Agent is running. +- SSM Agent isn't registered with the SSM endpoint. Try reinstalling SSM Agent. +- The discovered instance does not have permission to recieve SSM + commands, verify the instance includes the AmazonSSMManagedInstanceCore IAM policy. + ## Next steps - Read [Joining Nodes via AWS IAM Role](../../management/guides/joining-nodes-aws-iam.mdx) diff --git a/lib/srv/discovery/discovery.go b/lib/srv/discovery/discovery.go index 0c8ce38a11b5d..9889c846cc8a0 100644 --- a/lib/srv/discovery/discovery.go +++ b/lib/srv/discovery/discovery.go @@ -18,12 +18,15 @@ package discovery import ( "context" + "errors" "fmt" "strings" "time" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/ec2" + "github.com/aws/aws-sdk-go/service/ssm" "github.com/gravitational/trace" "github.com/sirupsen/logrus" @@ -331,7 +334,10 @@ func (s *Server) handleEC2Discovery() { s.Log.Debugf("EC2 instances discovered (AccountID: %s, Instances: %v), starting installation", instances.AccountID, genInstancesLogStr(instances.Instances)) if err := s.handleInstances(&instances); err != nil { - if trace.IsNotFound(err) { + var aErr awserr.Error + if errors.As(err, &aErr) && aErr.Code() == ssm.ErrCodeInvalidInstanceId { + s.Log.WithError(err).Error("Invalid instance ID found. This can happen if the instance doesnt have a running SSM agent that is registered with the SSM endpoint (may require reinstalling the SSM Agent, or giving the instance IAM permissions to receive SSM commands), or the discovery instance doesnt have permissions to access the node.") + } else if trace.IsNotFound(err) { s.Log.Debug("All discovered EC2 instances are already part of the cluster.") } else { s.Log.WithError(err).Error("Failed to enroll discovered EC2 instances.") From 1e966f6ce540d86211e7d241d0a40646445fa008 Mon Sep 17 00:00:00 2001 From: Alex McGrath Date: Thu, 27 Apr 2023 15:17:19 +0100 Subject: [PATCH 2/5] resolve comments --- docs/pages/server-access/guides/ec2-discovery.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/pages/server-access/guides/ec2-discovery.mdx b/docs/pages/server-access/guides/ec2-discovery.mdx index 65c3154c0f7d5..347dab6b52350 100644 --- a/docs/pages/server-access/guides/ec2-discovery.mdx +++ b/docs/pages/server-access/guides/ec2-discovery.mdx @@ -395,11 +395,11 @@ It is likely that you're running an older SSM agent version. Upgrade to SSM agen ### `InvalidInstanceId: Instances [[i-123]] not in a valid state for account 456` -The following problems can cause this exception: -- The discovery node don't have permission to access the managed node. -- AWS Systems Manager Agent(SSM Agent) isn't running. Verify that SSM Agent is running. +The following problems can cause this error: +- The Discovery Service doesn't have permission to access the managed node. +- AWS Systems Manager Agent (SSM Agent) isn't running. Verify that SSM Agent is running. - SSM Agent isn't registered with the SSM endpoint. Try reinstalling SSM Agent. -- The discovered instance does not have permission to recieve SSM +- The discovered instance does not have permission to receive SSM commands, verify the instance includes the AmazonSSMManagedInstanceCore IAM policy. ## Next steps From 41513ecf425de4f6844d4ec0d3acbf626c9422fc Mon Sep 17 00:00:00 2001 From: Alex McGrath Date: Tue, 2 May 2023 11:09:40 +0100 Subject: [PATCH 3/5] resolve comments --- lib/srv/discovery/discovery.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/srv/discovery/discovery.go b/lib/srv/discovery/discovery.go index 9889c846cc8a0..3baeea1346dd4 100644 --- a/lib/srv/discovery/discovery.go +++ b/lib/srv/discovery/discovery.go @@ -336,7 +336,7 @@ func (s *Server) handleEC2Discovery() { if err := s.handleInstances(&instances); err != nil { var aErr awserr.Error if errors.As(err, &aErr) && aErr.Code() == ssm.ErrCodeInvalidInstanceId { - s.Log.WithError(err).Error("Invalid instance ID found. This can happen if the instance doesnt have a running SSM agent that is registered with the SSM endpoint (may require reinstalling the SSM Agent, or giving the instance IAM permissions to receive SSM commands), or the discovery instance doesnt have permissions to access the node.") + s.Log.WithError(err).Error("Invalid instance ID found. This can happen if the instance does not have a running SSM agent registered with the SSM endpoint (may require reinstalling the SSM Agent, or giving the instance IAM permissions to receive SSM commands), or the discovery instance does not have permissions to access the node.") } else if trace.IsNotFound(err) { s.Log.Debug("All discovered EC2 instances are already part of the cluster.") } else { From ece8ddae6e68058760236a84162ce6c2eca537b0 Mon Sep 17 00:00:00 2001 From: Alex McGrath Date: Thu, 4 May 2023 11:35:18 +0100 Subject: [PATCH 4/5] resolve comments --- docs/pages/server-access/guides/ec2-discovery.mdx | 4 ++++ lib/srv/discovery/discovery.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/pages/server-access/guides/ec2-discovery.mdx b/docs/pages/server-access/guides/ec2-discovery.mdx index 347dab6b52350..9713c97cd0539 100644 --- a/docs/pages/server-access/guides/ec2-discovery.mdx +++ b/docs/pages/server-access/guides/ec2-discovery.mdx @@ -402,6 +402,10 @@ The following problems can cause this error: - The discovered instance does not have permission to receive SSM commands, verify the instance includes the AmazonSSMManagedInstanceCore IAM policy. +See SSM RunCommand error codes and troubleshooting information in AWS documentation for more details: +- https://docs.aws.amazon.com/systems-manager/latest/userguide/troubleshooting-managed-instances.html +- https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_SendCommand.html#API_SendCommand_Errors + ## Next steps - Read [Joining Nodes via AWS IAM Role](../../management/guides/joining-nodes-aws-iam.mdx) diff --git a/lib/srv/discovery/discovery.go b/lib/srv/discovery/discovery.go index 3baeea1346dd4..793ab0c549ada 100644 --- a/lib/srv/discovery/discovery.go +++ b/lib/srv/discovery/discovery.go @@ -336,7 +336,7 @@ func (s *Server) handleEC2Discovery() { if err := s.handleInstances(&instances); err != nil { var aErr awserr.Error if errors.As(err, &aErr) && aErr.Code() == ssm.ErrCodeInvalidInstanceId { - s.Log.WithError(err).Error("Invalid instance ID found. This can happen if the instance does not have a running SSM agent registered with the SSM endpoint (may require reinstalling the SSM Agent, or giving the instance IAM permissions to receive SSM commands), or the discovery instance does not have permissions to access the node.") + s.Log.WithError(err).Error("SSM RunCommand failed with ErrCodeInvalidInstanceId. Make sure that the instances have AmazonSSMManagedInstanceCore policy assigned. Also check that SSM agent is running and registered with the SSM endpoint on that instance and try restarting or reinstalling it in case of issues. See https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_SendCommand.html#API_SendCommand_Errors for more details.") } else if trace.IsNotFound(err) { s.Log.Debug("All discovered EC2 instances are already part of the cluster.") } else { From 580ca2a2804f2cc1c910d10bba8b97cbecb644ba Mon Sep 17 00:00:00 2001 From: Alex McGrath Date: Thu, 4 May 2023 16:18:23 +0100 Subject: [PATCH 5/5] fix typo --- lib/srv/discovery/discovery.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/srv/discovery/discovery.go b/lib/srv/discovery/discovery.go index 793ab0c549ada..0d8d0b3d47bd7 100644 --- a/lib/srv/discovery/discovery.go +++ b/lib/srv/discovery/discovery.go @@ -336,7 +336,7 @@ func (s *Server) handleEC2Discovery() { if err := s.handleInstances(&instances); err != nil { var aErr awserr.Error if errors.As(err, &aErr) && aErr.Code() == ssm.ErrCodeInvalidInstanceId { - s.Log.WithError(err).Error("SSM RunCommand failed with ErrCodeInvalidInstanceId. Make sure that the instances have AmazonSSMManagedInstanceCore policy assigned. Also check that SSM agent is running and registered with the SSM endpoint on that instance and try restarting or reinstalling it in case of issues. See https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_SendCommand.html#API_SendCommand_Errors for more details.") + s.Log.WithError(err).Error("SSM SendCommand failed with ErrCodeInvalidInstanceId. Make sure that the instances have AmazonSSMManagedInstanceCore policy assigned. Also check that SSM agent is running and registered with the SSM endpoint on that instance and try restarting or reinstalling it in case of issues. See https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_SendCommand.html#API_SendCommand_Errors for more details.") } else if trace.IsNotFound(err) { s.Log.Debug("All discovered EC2 instances are already part of the cluster.") } else {