Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 31 additions & 19 deletions api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions api/proto/teleport/machineid/v1/bot_instance_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ message DeleteBotInstanceRequest {
message SubmitHeartbeatRequest {
// The heartbeat data to submit.
BotInstanceStatusHeartbeat heartbeat = 1;

// The health of the services/output `tbot` is running.
repeated BotInstanceServiceHealth service_health = 2;
}

// The response for SubmitHeartbeat.
Expand Down
36 changes: 36 additions & 0 deletions lib/auth/machineid/machineidv1/bot_instance_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ package machineidv1
import (
"context"
"log/slog"
"os"
"strconv"
"time"

"github.com/gravitational/trace"
Expand All @@ -46,6 +48,12 @@ const (
// ensure the instance remains accessible until shortly after the last
// issued certificate expires.
ExpiryMargin = time.Minute * 5

// serviceNameLimit is the maximum length in bytes of a bot service name.
serviceNameLimit = 64

// statusReasonLimit is the maximum length in bytes of a service status reason.
statusReasonLimit = 256
)

// BotInstancesCache is the subset of the cached resources that the Service queries.
Expand Down Expand Up @@ -204,6 +212,17 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit
return nil, trace.BadParameter("heartbeat: must be non-nil")
}

for _, svcHealth := range req.GetServiceHealth() {
name := svcHealth.GetService().GetName()
if len(name) > serviceNameLimit {
return nil, trace.BadParameter("service name %q is longer than %d bytes", name, serviceNameLimit)
}
reason := svcHealth.GetReason()
if len(reason) > statusReasonLimit {
return nil, trace.BadParameter("service %q has a status reason longer than %d bytes", name, statusReasonLimit)
}
}

// Enforce that the connecting client is a bot and has a bot instance ID.
botName := authCtx.Identity.GetIdentity().BotName
botInstanceID := authCtx.Identity.GetIdentity().BotInstanceID
Expand Down Expand Up @@ -238,6 +257,11 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit
// Append the new heartbeat to the end.
instance.Status.LatestHeartbeats = append(instance.Status.LatestHeartbeats, req.Heartbeat)

if storeHeartbeatExtras() {
// Overwrite the service health.
instance.Status.ServiceHealth = req.ServiceHealth
}
Comment on lines +260 to +263
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like we should give this some test coverage. It's the sort of thing nobody uses or thinks about until the day when it's really needed and wont work because of some subtle regression which snuck in over time.


return instance, nil
})
if err != nil {
Expand All @@ -246,3 +270,15 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit

return &pb.SubmitHeartbeatResponse{}, nil
}

// storeHeartbeatExtras returns whether we should store "extra" data submitted
// with tbot heartbeats, such as the service health. Defaults to true unless the
// TELEPORT_DISABLE_TBOT_HEARTBEAT_EXTRAS environment variable is set to true on
// the auth server.
func storeHeartbeatExtras() bool {
disabled, err := strconv.ParseBool(os.Getenv("TELEPORT_DISABLE_TBOT_HEARTBEAT_EXTRAS"))
if err != nil {
return true
}
return !disabled
}
79 changes: 79 additions & 0 deletions lib/auth/machineid/machineidv1/bot_instance_service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"fmt"
"slices"
"strconv"
"strings"
"testing"

"github.com/google/go-cmp/cmp"
Expand Down Expand Up @@ -265,6 +266,7 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) {
createBotInstance bool
assertErr assert.ErrorAssertionFunc
wantHeartbeat bool
wantServiceHealth []*machineidv1.BotInstanceServiceHealth
}{
{
name: "success",
Expand All @@ -273,10 +275,30 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) {
Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{
Hostname: "llama",
},
ServiceHealth: []*machineidv1.BotInstanceServiceHealth{
{
Service: &machineidv1.BotInstanceServiceIdentifier{
Type: "application-tunnel",
Name: "my-application-tunnel",
},
Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY,
Reason: ptr("application is broken"),
},
},
},
identity: goodIdentity,
assertErr: assert.NoError,
wantHeartbeat: true,
wantServiceHealth: []*machineidv1.BotInstanceServiceHealth{
{
Service: &machineidv1.BotInstanceServiceIdentifier{
Type: "application-tunnel",
Name: "my-application-tunnel",
},
Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY,
Reason: ptr("application is broken"),
},
},
},
{
name: "missing bot name",
Expand Down Expand Up @@ -335,6 +357,54 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) {
},
wantHeartbeat: false,
},
{
name: "service name too long",
createBotInstance: true,
req: &machineidv1.SubmitHeartbeatRequest{
Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{
Hostname: "llama",
},
ServiceHealth: []*machineidv1.BotInstanceServiceHealth{
{
Service: &machineidv1.BotInstanceServiceIdentifier{
Type: "application-tunnel",
Name: strings.Repeat("a", 100),
},
Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY,
Reason: ptr("application is broken"),
},
},
},
identity: goodIdentity,
assertErr: func(t assert.TestingT, err error, i ...any) bool {
return assert.True(t, trace.IsBadParameter(err)) && assert.Contains(t, err.Error(), "is longer than 64 bytes")
},
wantHeartbeat: false,
},
{
name: "status reason too long",
createBotInstance: true,
req: &machineidv1.SubmitHeartbeatRequest{
Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{
Hostname: "llama",
},
ServiceHealth: []*machineidv1.BotInstanceServiceHealth{
{
Service: &machineidv1.BotInstanceServiceIdentifier{
Type: "application-tunnel",
Name: "my-application-tunnel",
},
Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY,
Reason: ptr(strings.Repeat("a", 300)),
},
},
},
identity: goodIdentity,
assertErr: func(t assert.TestingT, err error, i ...any) bool {
return assert.True(t, trace.IsBadParameter(err)) && assert.Contains(t, err.Error(), "status reason longer than 256 bytes")
},
wantHeartbeat: false,
},
}

for _, tt := range tests {
Expand Down Expand Up @@ -385,6 +455,13 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) {
assert.Nil(t, bi.Status.InitialHeartbeat)
assert.Empty(t, bi.Status.LatestHeartbeats)
}
assert.Empty(t,
cmp.Diff(
bi.Status.ServiceHealth,
tt.wantServiceHealth,
protocmp.Transform(),
),
)
}
})
}
Expand Down Expand Up @@ -597,3 +674,5 @@ func newBotInstanceService(

return service
}

func ptr[T any](v T) *T { return &v }
4 changes: 2 additions & 2 deletions lib/tbot/bot/bot.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func (b *Bot) Run(ctx context.Context) (err error) {
// statuses. Otherwise we will not include the service in heartbeats or
// the `/readyz` endpoint.
if handle.statusReporter.used {
handle.statusReporter.reporter = registry.AddService(handle.name)
handle.statusReporter.reporter = registry.AddService(handle.serviceType, handle.name)
}
}

Expand Down Expand Up @@ -146,7 +146,7 @@ func (b *Bot) OneShot(ctx context.Context) (err error) {
}

// Add oneshot services to the registry.
handle.statusReporter.reporter = registry.AddService(handle.name)
handle.statusReporter.reporter = registry.AddService(handle.serviceType, handle.name)
oneShotServices = append(oneShotServices, handle)
}

Expand Down
61 changes: 60 additions & 1 deletion lib/tbot/internal/heartbeat/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"log/slog"
"os"
"runtime"
"sort"
"time"

"github.com/gravitational/trace"
Expand Down Expand Up @@ -229,8 +230,20 @@ func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) erro
Kind: s.cfg.BotKind,
}

status := s.cfg.StatusRegistry.OverallStatus()
serviceHealth := make([]*machineidv1pb.BotInstanceServiceHealth, 0, len(status.Services))
for name, serviceStatus := range status.Services {
serviceHealth = append(serviceHealth, statusToServiceHealth(name, serviceStatus))
}

// Sort the services by name to make tests deterministic.
sort.Slice(serviceHealth, func(a, b int) bool {
return serviceHealth[a].Service.Name < serviceHealth[b].Service.Name
})

_, err = s.cfg.Client.SubmitHeartbeat(ctx, &machineidv1pb.SubmitHeartbeatRequest{
Heartbeat: hb,
Heartbeat: hb,
ServiceHealth: serviceHealth,
})
if err != nil {
return trace.Wrap(err, "submitting heartbeat")
Expand All @@ -239,3 +252,49 @@ func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) erro
s.cfg.Logger.InfoContext(ctx, "Sent heartbeat", "data", hb.String())
return nil
}

func statusToServiceHealth(name string, status *readyz.ServiceStatus) *machineidv1pb.BotInstanceServiceHealth {
health := &machineidv1pb.BotInstanceServiceHealth{
Service: &machineidv1pb.BotInstanceServiceIdentifier{
Name: trimString(name, 64),
Type: status.ServiceType,
},
}

switch status.Status {
case readyz.Initializing:
health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_INITIALIZING
case readyz.Healthy:
health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_HEALTHY
case readyz.Unhealthy:
health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY
}

if status.Reason != "" {
reason := trimString(status.Reason, 256)
health.Reason = &reason
}

if status.UpdatedAt != nil {
health.UpdatedAt = timestamppb.New(*status.UpdatedAt)
}

return health
}

func trimString(s string, maxBytes int) string {
if len(s) <= maxBytes {
return s
}

// Trim the string to maxBytes, honoring rune boundaries for non-ASCII text.
byteCount := 0
for i, r := range s {
runeSize := len(string(r))
if byteCount+runeSize > maxBytes {
return s[:i]
}
byteCount += runeSize
}
return s
}
Loading
Loading