Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/spire-server/cli/healthcheck/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ func (c *healthCheckCommand) run() error {
}
return errors.New("cannot create registration client")
}
defer client.Release()

bundleClient := client.NewBundleClient()

// Currently using the ability to fetch a bundle as the health check. This
Expand Down
30 changes: 15 additions & 15 deletions conf/agent/agent_full.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This is the SPIRE Agent configuration file including all possible configuration
# options.
# options.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: All the minor whitespace corrections in the doc comments distract from the main purpose of the pull request. I agree the corrections should be made, it would be easier to review in a separate commit.


# agent: Contains core configuration parameters.
agent {
Expand All @@ -24,16 +24,16 @@ agent {

# server_address: DNS name or IP address of the SPIRE server.
server_address = "127.0.0.1"

# server_port: Port number of the SPIRE server.
server_port = "8081"

# socket_path: Location to bind the workload API socket. Default: /tmp/agent.sock.
socket_path = "/tmp/agent.sock"

# trust_bundle_path: Path to the SPIRE server CA bundle.
trust_bundle_path = "./conf/agent/dummy_root_ca.crt"

# trust_bundle_url: URL to download the initial SPIRE server trust bundle.
# trust_bundle_url = ""

Expand All @@ -56,9 +56,9 @@ agent {
# Each nested object has the following format:
#
# PluginType "plugin_name" {
#
#
# # plugin_cmd: Path to the plugin implementation binary (optional, not
# # needed for built-ins)
# # needed for built-ins)
# plugin_cmd = <string>
#
# # plugin_checksum: An optional sha256 of the plugin binary (optional,
Expand Down Expand Up @@ -149,7 +149,7 @@ plugins {
# cluster: Name of the cluster. It must correspond to a cluster
# configured in the server plugin.
# cluster = ""

# token_path: Path to the service account token on disk.
# Default: /run/secrets/kubernetes.io/serviceaccount/token.
# token_path = "/run/secrets/kubernetes.io/serviceaccount/token"
Expand Down Expand Up @@ -180,7 +180,7 @@ plugins {
# certificate_path: The path to the certificate bundle on disk. The
# file must contain one or more PEM blocks, starting with the identity
# certificate followed by any intermediate certificates necessary for
# chain-of-trust validation.
# chain-of-trust validation.
# certificate_path = ""

# intermediates_path: Optional. The path to a chain of intermediate
Expand All @@ -204,7 +204,7 @@ plugins {
# docker_version = ""
}
}

# WorkloadAttestor "k8s": A workload attestor which allows selectors based
# on Kubernetes constructs such ns (namespace) and sa (service account).
WorkloadAttestor "k8s" {
Expand Down Expand Up @@ -244,7 +244,7 @@ plugins {
# node_name_env = "MY_NODE_NAME"

# node_name: The name of the node. Overrides the value obtained by
# the environment variable specified by node_name_env.
# the environment variable specified by node_name_env.
# node_name = ""
}
}
Expand Down Expand Up @@ -276,7 +276,7 @@ plugins {
# port = 9988
# }

# DogStatsd = [
# DogStatsd = [
# # List of DogStatsd addresses.
# { address = "localhost:8125" },
# { address = "collector.example.org:1337" },
Expand All @@ -301,7 +301,7 @@ plugins {
# }

# health_checks: If health checking is desired use this section to configure
# and expose an additional server endpoint for such purpose.
# and expose an additional agent endpoint for such purpose.
# health_checks {
# # listener_enabled: Enables health checks endpoint.
# listener_enabled = true
Expand All @@ -312,9 +312,9 @@ plugins {
# # bind_port: HTTP Port number of the health checks endpoint. Default: 80.
# # bind_port = "80"

# # live_path: HTTP resource path for checking server liveness. Default: /live.
# # live_path: HTTP resource path for checking agent liveness. Default: /live.
# # live_path = "/live"

# # ready_path: HTTP resource path for checking server readiness. Default: /ready.
# # ready_path: HTTP resource path for checking agent readiness. Default: /ready.
# # ready_path = "/ready"
# }
2 changes: 1 addition & 1 deletion doc/spire_agent.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ The agent can expose additional endpoint that can be used for health checking. I
health_checks {
listener_enabled = true
bind_address = "localhost"
bind_port = "80"
bind_port = "8080"
live_path = "/live"
ready_path = "/ready"
}
Expand Down
2 changes: 1 addition & 1 deletion doc/spire_server.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ The server can expose an additional endpoint that can be used for health checkin
health_checks {
listener_enabled = true
bind_address = "localhost"
bind_port = "80"
bind_port = "8080"
live_path = "/live"
ready_path = "/ready"
}
Expand Down
27 changes: 24 additions & 3 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@ package agent

import (
"context"
"errors"
"fmt"
"net/http"
_ "net/http/pprof" //nolint: gosec // import registers routes on DefaultServeMux
"os"
"path"
"runtime"
"sync"
"time"

"github.com/spiffe/go-spiffe/v2/spiffeid"
api_workload "github.com/spiffe/spire/api/workload"
admin_api "github.com/spiffe/spire/pkg/agent/api"
node_attestor "github.com/spiffe/spire/pkg/agent/attestor/node"
workload_attestor "github.com/spiffe/spire/pkg/agent/attestor/workload"
Expand All @@ -30,6 +31,8 @@ import (
"github.com/spiffe/spire/proto/spire/api/server/bundle/v1"
_ "golang.org/x/net/trace" // registers handlers on the DefaultServeMux
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

type Agent struct {
Expand Down Expand Up @@ -97,7 +100,7 @@ func (a *Agent) Run(ctx context.Context) error {

endpoints := a.newEndpoints(cat, metrics, manager)

if err := healthChecks.AddCheck("agent", a, time.Minute); err != nil {
if err := healthChecks.AddCheck("agent", a); err != nil {
return fmt.Errorf("failed adding healthcheck: %v", err)
}

Expand Down Expand Up @@ -261,5 +264,23 @@ func (a *Agent) agentSVIDPath() string {

// Status is used as a top-level health check for the Agent.
func (a *Agent) Status() (interface{}, error) {
return nil, nil
client := api_workload.NewX509Client(&api_workload.X509ClientConfig{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same situation as the server applies here also.

Addr: a.c.BindAddress,
FailOnError: true,
})
defer client.Stop()

errCh := make(chan error, 1)
go func() {
errCh <- client.Start()
}()

err := <-errCh
if status.Code(err) == codes.Unavailable {
return nil, errors.New("workload api is unavailable") //nolint: golint // error is (ab)used for CLI output
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that it would be good to expose the error captured in err.

}

return health.Details{
Message: "successfully created a workload api client to fetch x509 svid",
}, nil
Comment on lines +267 to +285
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had something very similar in mind. I think it needs a context.WithTimeout and a select or we risk blocking on the client indefinitely:
https://play.golang.org/p/_R0PeOF3tU9
vs.
https://play.golang.org/p/KOOZuJNTELv

Obviously, we would want to add a test for the race detector on that select as well.

}
31 changes: 18 additions & 13 deletions pkg/common/health/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,21 @@ type Config struct {
UnusedKeys []string `hcl:",unusedKeys"`
}

// getAddress returns an address suitable for use as http.Server.Addr.
func (c *Config) getAddress() string {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moving this method above getReadyPath makes review a little more difficult. This is not actually a change to getAddress(), but I have to work to confirm that.

host := "localhost"
if c.BindAddress != "" {
host = c.BindAddress
}

port := "80"
if c.BindPort != "" {
port = c.BindPort
}

return fmt.Sprintf("%s:%s", host, port)
}

// getReadyPath returns the configured value or a default
func (c *Config) getReadyPath() string {
if c.ReadyPath == "" {
Expand All @@ -36,17 +51,7 @@ func (c *Config) getLivePath() string {
return c.LivePath
}

// getAddress returns an address suitable for use as http.Server.Addr.
func (c *Config) getAddress() string {
host := "localhost"
if c.BindAddress != "" {
host = c.BindAddress
}

port := "80"
if c.BindPort != "" {
port = c.BindPort
}

return fmt.Sprintf("%s:%s", host, port)
// Details are additional data to be used when the system is ready
type Details struct {
Message string `json:"message,omitempty"`
}
11 changes: 7 additions & 4 deletions pkg/common/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (
"github.com/spiffe/spire/pkg/common/telemetry"
)

const readyCheckInterval = time.Minute

// health.Checker is responsible for running health checks and serving the healthcheck HTTP paths
type Checker struct {
config Config
Expand Down Expand Up @@ -46,20 +48,21 @@ func NewChecker(config Config, log logrus.FieldLogger) *Checker {
}
}

hc.StatusListener = &statusListener{}
hc.Logger = &logadapter{FieldLogger: log.WithField(telemetry.SubsystemName, "health")}
l := log.WithField(telemetry.SubsystemName, "health")
hc.StatusListener = &statusListener{log: l}
hc.Logger = &logadapter{FieldLogger: l}

return &Checker{config: config, server: server, hc: hc, log: log}
}

func (c *Checker) AddCheck(name string, checker health.ICheckable, interval time.Duration) error {
func (c *Checker) AddCheck(name string, checker health.ICheckable) error {
c.mutex.Lock()
defer c.mutex.Unlock()

return c.hc.AddCheck(&health.Config{
Name: name,
Checker: checker,
Interval: interval,
Interval: readyCheckInterval,
Fatal: true,
})
}
Expand Down
26 changes: 23 additions & 3 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@ package server

import (
"context"
"errors"
"fmt"
"net/http"
_ "net/http/pprof" //nolint: gosec // import registers routes on DefaultServeMux
"net/url"
"os"
"runtime"
"sync"
"time"

"github.com/andres-erbsen/clock"
server_util "github.com/spiffe/spire/cmd/spire-server/util"
"github.com/spiffe/spire/pkg/common/health"
"github.com/spiffe/spire/pkg/common/hostservices/metricsservice"
common_services "github.com/spiffe/spire/pkg/common/plugin/hostservices"
Expand All @@ -29,6 +30,7 @@ import (
"github.com/spiffe/spire/pkg/server/plugin/hostservices"
"github.com/spiffe/spire/pkg/server/registration"
"github.com/spiffe/spire/pkg/server/svid"
"github.com/spiffe/spire/proto/spire/api/server/bundle/v1"
"google.golang.org/grpc"
)

Expand Down Expand Up @@ -158,7 +160,7 @@ func (s *Server) run(ctx context.Context) (err error) {

registrationManager := s.newRegistrationManager(cat, metrics)

if err := healthChecks.AddCheck("server", s, time.Minute); err != nil {
if err := healthChecks.AddCheck("server", s); err != nil {
return fmt.Errorf("failed adding healthcheck: %v", err)
}

Expand Down Expand Up @@ -384,5 +386,23 @@ func (s *Server) validateTrustDomain(ctx context.Context, ds datastore.DataStore

// Status is used as a top-level health check for the Server.
func (s *Server) Status() (interface{}, error) {
return nil, nil
client, err := server_util.NewServerClient(s.config.BindUDSAddress.Name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm realizing that there could be a race here at startup time, where this can be executed before the server is serving. In that case, throwing this error could be confusing.

if err != nil {
return nil, errors.New("cannot create registration client")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that it would be good to expose the error captured in err.

}
Copy link
Member

@azdagron azdagron Jan 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to defer a call to Release() on the client or we will leak a gRPC connection.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fixed it with 4a3c745.

defer client.Release()

bundleClient := client.NewBundleClient()

// Currently using the ability to fetch a bundle as the health check. This
// **could** be problematic if the Upstream CA signing process is lengthy.
// As currently coded however, the API isn't served until after
// the server CA has been signed by upstream.
if _, err := bundleClient.GetBundle(context.Background(), &bundle.GetBundleRequest{}); err != nil {
return nil, errors.New("unable to fetch bundle")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that it would be good to expose the error captured in err.

}

return health.Details{
Message: "successfully fetched bundle",
}, nil
}
28 changes: 19 additions & 9 deletions test/integration/suites/k8s-reconcile/conf/agent/spire-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ data:
}
}

health_checks {
listener_enabled = true
bind_address = "0.0.0.0"
bind_port = "8080"
live_path = "/live"
ready_path = "/ready"
}

---

apiVersion: apps/v1
Expand Down Expand Up @@ -129,13 +137,15 @@ spec:
- name: spire-token
mountPath: /var/run/secrets/tokens
livenessProbe:
exec:
command: ["/opt/spire/bin/spire-agent", "healthcheck", "-socketPath", "/run/spire/sockets/agent.sock"]
httpGet:
path: /live
port: 8080
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
exec:
command: ["/opt/spire/bin/spire-agent", "healthcheck", "-socketPath", "/run/spire/sockets/agent.sock", "--shallow"]
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 10
periodSeconds: 10
volumes:
Expand All @@ -151,8 +161,8 @@ spec:
type: DirectoryOrCreate
- name: spire-token
projected:
sources:
- serviceAccountToken:
path: spire-agent
expirationSeconds: 7200
audience: spire-server
sources:
- serviceAccountToken:
path: spire-agent
expirationSeconds: 7200
audience: spire-server
Loading