PKI Health Check Command (#17750)

* Stub out initial health check command This command will be used to generate health check results for the PKI engine. Signed-off-by: Alexander Scheel <[email protected]> * Start common health check implementation Signed-off-by: Alexander Scheel <[email protected]> * Add common health check utilities These utilities will collect helpers not specific to PKI health checks, such as formatting longer durations more legibly. Signed-off-by: Alexander Scheel <[email protected]> * Add PKI health check common utils Many health checks will need issuer and/or CRL information in order to execute. We've centrally located these helpers to avoid particular health checks from needing to reimplement them each time. Signed-off-by: Alexander Scheel <[email protected]> * Adding ca_validity_period health check Signed-off-by: Alexander Scheel <[email protected]> * Begin using health-checks in PKI command Signed-off-by: Alexander Scheel <[email protected]> * Allow parsing raw requests afterwards This shifts the last of the logic difference between Read(...) and ReadRaw(...) to a new helper, allowing ReadRaw(...) requests to be parsed into the same response structure afterwards as Read(...); this allows API callers to fetch the raw secret and inspect the raw response object in case something went wrong (error code &c) -- and when the request succeeds, they can still get the api.Secret out. This will be used with the PKI health check functionality, making both LIST and READ operations use ReadRaw, and optionally parsing the secret afterwards. Signed-off-by: Alexander Scheel <[email protected]> * Add crl_validity_period health check Signed-off-by: Alexander Scheel <[email protected]> * Add tests for PKI health check Signed-off-by: Alexander Scheel <[email protected]> * Fix bug in raw reading with contexts When reading raw objects, don't manually call the context cancellation: this causes timeouts and/or EOF errors when attempting to read or parse the response body. 
See message in client.RawRequestWithContext(...) for more information. This was causing the test suite to randomly fail, due to the context cancelling. The test suite's client usually had a default timeout, whereas the CLI didn't, and thus didn't exhibit the same issue. Signed-off-by: Alexander Scheel <[email protected]> * Add changelog Signed-off-by: Alexander Scheel <[email protected]> * Fix typo in permissions message Signed-off-by: Alexander Scheel <[email protected]> * Move %v->%w for errs Signed-off-by: Alexander Scheel <[email protected]> Signed-off-by: Alexander Scheel <[email protected]>
hashicorp · Nov 16, 2022 · 02d265b · 02d265b
1 parent 5cfb477
commit 02d265b
Show file tree

Hide file tree

Showing 10 changed files with 1,273 additions and 15 deletions.
diff --git a/api/logical.go b/api/logical.go
@@ -66,6 +66,27 @@ func (c *Logical) ReadWithDataWithContext(ctx context.Context, path string, data
 	defer cancelFunc()
 
 	resp, err := c.readRawWithDataWithContext(ctx, path, data)
+	return c.ParseRawResponseAndCloseBody(resp, err)
+}
+
+// ReadRaw fetches path and returns the raw *Response instead of a parsed
+// secret. It is shorthand for ReadRawWithData with no query data.
+func (c *Logical) ReadRaw(path string) (*Response, error) {
+	return c.ReadRawWithData(path, nil)
+}
+
+// ReadRawWithData fetches path with the given query data and returns the raw
+// *Response. It delegates to ReadRawWithDataWithContext using
+// context.Background().
+func (c *Logical) ReadRawWithData(path string, data map[string][]string) (*Response, error) {
+	return c.ReadRawWithDataWithContext(context.Background(), path, data)
+}
+
+// ReadRawWithDataWithContext applies the client's configured timeout to ctx,
+// then fetches path with the given query data and returns the raw *Response.
+func (c *Logical) ReadRawWithDataWithContext(ctx context.Context, path string, data map[string][]string) (*Response, error) {
+	// The cancel function is deliberately dropped: see the note on
+	// RawRequestWithContext in client.go for why Cancel is not called here.
+	// The difference between the two methods is that RawRequestWithContext
+	// takes a Request object directly, whereas this one builds the request
+	// up on behalf of the caller.
+	timeoutCtx, _ := c.c.withConfiguredTimeout(ctx)
+	return c.readRawWithDataWithContext(timeoutCtx, path, data)
+}
+
+func (c *Logical) ParseRawResponseAndCloseBody(resp *Response, err error) (*Secret, error) {
 	if resp != nil {
 		defer resp.Body.Close()
 	}
@@ -90,21 +111,6 @@ func (c *Logical) ReadWithDataWithContext(ctx context.Context, path string, data
 	return ParseSecret(resp.Body)
 }
 
-func (c *Logical) ReadRaw(path string) (*Response, error) {
-	return c.ReadRawWithData(path, nil)
-}
-
-func (c *Logical) ReadRawWithData(path string, data map[string][]string) (*Response, error) {
-	return c.ReadRawWithDataWithContext(context.Background(), path, data)
-}
-
-func (c *Logical) ReadRawWithDataWithContext(ctx context.Context, path string, data map[string][]string) (*Response, error) {
-	ctx, cancelFunc := c.c.withConfiguredTimeout(ctx)
-	defer cancelFunc()
-
-	return c.readRawWithDataWithContext(ctx, path, data)
-}
-
 func (c *Logical) readRawWithDataWithContext(ctx context.Context, path string, data map[string][]string) (*Response, error) {
 	r := c.c.NewRequest(http.MethodGet, "/v1/"+path)
 

diff --git a/changelog/17750.txt b/changelog/17750.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+cli/pki: Add health-check subcommand to evaluate the health of a PKI instance.
+```
diff --git a/command/commands.go b/command/commands.go
@@ -787,6 +787,11 @@ func initCommands(ui, serverCmdUi cli.Ui, runOpts *RunOptions) {
 				ShutdownCh:  MakeShutdownCh(),
 			}, nil
 		},
+		"pki health-check": func() (cli.Command, error) {
+			return &PKIHealthCheckCommand{
+				BaseCommand: getBaseCommand(),
+			}, nil
+		},
 	}
 
 	// Disabled by default until functional

diff --git a/command/healthcheck/common.go b/command/healthcheck/common.go
@@ -0,0 +1,276 @@
+/*
+ * The healthcheck package attempts to allow generic checks of arbitrary
+ * engines, while providing a common framework with some performance
+ * efficiencies in mind.
+ *
+ * The core of this package is the Executor context; a caller would
+ * provision a set of checks, an API client, and a configuration,
+ * which the executor would use to decide which checks to execute
+ * and how.
+ *
+ * Checks are based around a series of remote paths that are fetched by
+ * the client; these are broken into two categories: static paths, which
+ * can always be fetched; and dynamic paths, which the check fetches based
+ * on earlier results.
+ *
+ * For instance, a basic PKI CA lifetime check will have a static fetch
+ * against the list of CAs, and a dynamic fetch, using that earlier list, to
+ * fetch the PEMs of all CAs.
+ *
+ * This allows health checks to share data: many PKI checks will need the
+ * issuer list, and fetching it repeatedly could result in a performance
+ * impact.
+ */
+
+package healthcheck
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/hashicorp/vault/api"
+	"github.com/hashicorp/vault/sdk/logical"
+)
+
+// Executor holds the state shared by a set of health checks run against a
+// single mount: the API client, per-check configuration, the registered
+// checks, and a cache of already-fetched paths.
+type Executor struct {
+	// Client is the Vault API client used to perform every fetch.
+	Client *api.Client
+	// Mount is substituted for the "{{mount}}" placeholder in paths.
+	Mount string
+	// DefaultEnabled is the enabled state applied by BuildConfig to checks
+	// whose default configuration does not set "enabled"; when false, it
+	// also overrides checks that default to enabled.
+	DefaultEnabled bool
+
+	// Config is the merged configuration, keyed by check name and then by
+	// option name. It is populated by BuildConfig.
+	Config map[string]map[string]interface{}
+
+	// Resources caches fetch results, keyed by templated path and then by
+	// operation. It is populated by FetchIfNotFetched.
+	Resources map[string]map[logical.Operation]*PathFetch
+
+	// Checkers is the list of registered checks, in the order they were
+	// added.
+	Checkers []Check
+}
+
+// NewExecutor returns an Executor bound to the given client and mount, with
+// checks enabled by default and empty configuration and resource caches.
+func NewExecutor(client *api.Client, mount string) *Executor {
+	executor := new(Executor)
+
+	executor.Client = client
+	executor.Mount = mount
+	executor.DefaultEnabled = true
+	executor.Config = map[string]map[string]interface{}{}
+	executor.Resources = map[string]map[logical.Operation]*PathFetch{}
+
+	return executor
+}
+
+// AddCheck registers c with the executor by appending it to Checkers.
+func (e *Executor) AddCheck(c Check) {
+	e.Checkers = append(e.Checkers, c)
+}
+
+// BuildConfig computes and applies the configuration of every registered
+// check.
+//
+// For each check, the check's default configuration is taken as the base,
+// the executor-wide DefaultEnabled setting is applied to its "enabled"
+// option, and any options present under the check's name in external are
+// layered on top. The merged result is passed to the check's LoadConfig and
+// stored in e.Config.
+//
+// An error is returned if a check has an empty or duplicate name, if the
+// default "enabled" value is not a bool, if the external configuration for a
+// check is not a map of options or names an option missing from the check's
+// defaults, or if LoadConfig fails.
+func (e *Executor) BuildConfig(external map[string]interface{}) error {
+	merged := e.Config
+
+	for index, checker := range e.Checkers {
+		name := checker.Name()
+		if _, present := merged[name]; name == "" || present {
+			return fmt.Errorf("bad checker %v: name is empty or already present: %v", index, name)
+		}
+
+		// Fetch the default configuration. A check without any options may
+		// return a nil map; allocate one so the writes below cannot panic.
+		config := checker.DefaultConfig()
+		if config == nil {
+			config = make(map[string]interface{})
+		}
+
+		// If the check returns an enabled status, verify it matches our
+		// expectations (in the event it should be disabled by default);
+		// otherwise, add it in.
+		enabled, present := config["enabled"]
+		if !present {
+			config["enabled"] = e.DefaultEnabled
+		} else {
+			isEnabled, ok := enabled.(bool)
+			if !ok {
+				return fmt.Errorf("bad default configuration for %v: enabled must be a bool, got %T", name, enabled)
+			}
+			if isEnabled && !e.DefaultEnabled {
+				config["enabled"] = e.DefaultEnabled
+			}
+		}
+
+		// Now apply any external config for this check. The external value
+		// is caller-supplied, so reject the wrong shape instead of
+		// panicking on it.
+		if rawConfig, present := external[name]; present {
+			econfig, ok := rawConfig.(map[string]interface{})
+			if !ok {
+				return fmt.Errorf("bad configuration for %v: expected a map of options, got %T", name, rawConfig)
+			}
+
+			for param, evalue := range econfig {
+				if _, ok := config[param]; !ok {
+					// Assumption: default configs have all possible
+					// configuration options. This external config has
+					// an unknown option, so we want to error out.
+					return fmt.Errorf("unknown configuration option for %v: %v", name, param)
+				}
+
+				config[param] = evalue
+			}
+		}
+
+		// Now apply it and save it.
+		if err := checker.LoadConfig(config); err != nil {
+			return fmt.Errorf("error saving merged config for %v: %w", name, err)
+		}
+		merged[name] = config
+	}
+
+	return nil
+}
+
+// Execute runs every enabled check in registration order: it has the check
+// fetch its resources, evaluates it, and collects the results keyed by the
+// check's name. Each result's endpoint has its mount placeholder expanded
+// and its StatusDisplay filled in from ResultStatusNameMap. The first error
+// from any check aborts the run and is returned with a nil map.
+func (e *Executor) Execute() (map[string][]*Result, error) {
+	allResults := map[string][]*Result{}
+
+	for i := range e.Checkers {
+		check := e.Checkers[i]
+		if !check.IsEnabled() {
+			continue
+		}
+
+		err := check.FetchResources(e)
+		if err != nil {
+			return nil, err
+		}
+
+		checkResults, err := check.Evaluate(e)
+		if err != nil {
+			return nil, err
+		}
+
+		// Results are pointers, so they are updated in place.
+		for j := range checkResults {
+			entry := checkResults[j]
+			entry.Endpoint = e.templatePath(entry.Endpoint)
+			entry.StatusDisplay = ResultStatusNameMap[entry.Status]
+		}
+
+		allResults[check.Name()] = checkResults
+	}
+
+	return allResults, nil
+}
+
+// templatePath expands every "{{mount}}" placeholder in path to the
+// executor's mount.
+func (e *Executor) templatePath(path string) string {
+	const placeholder = "{{mount}}"
+	return strings.Replace(path, placeholder, e.Mount, -1)
+}
+
+// FetchIfNotFetched returns the result of performing op against rawPath,
+// issuing the request only if no result for that path and operation is
+// already cached in e.Resources.
+//
+// rawPath may contain the "{{mount}}" placeholder, which is expanded before
+// the lookup. Only logical.ReadOperation and logical.ListOperation are
+// supported; any other operation yields an error and leaves the cache
+// untouched. Fetch failures are recorded on the returned PathFetch and
+// cached like successes; the returned error is whatever FetchSurfaceError
+// reports for the result.
+func (e *Executor) FetchIfNotFetched(op logical.Operation, rawPath string) (*PathFetch, error) {
+	path := e.templatePath(rawPath)
+
+	// Validate the operation up front so that an unsupported operation
+	// never leaves an empty entry behind in the cache.
+	data := map[string][]string{}
+	switch op {
+	case logical.ListOperation:
+		data["list"] = []string{"true"}
+	case logical.ReadOperation:
+		// No extra query data is required for a plain read.
+	default:
+		return nil, fmt.Errorf("unknown operation: %v on %v", op, path)
+	}
+
+	// Indexing a missing (nil) inner map is safe and yields nil.
+	if cached := e.Resources[path][op]; cached != nil {
+		return cached, cached.FetchSurfaceError()
+	}
+
+	// Must not exist in cache; create it. The outer map is allocated lazily
+	// so that an Executor built without NewExecutor does not panic here.
+	if e.Resources == nil {
+		e.Resources = make(map[string]map[logical.Operation]*PathFetch)
+	}
+	if e.Resources[path] == nil {
+		e.Resources[path] = make(map[logical.Operation]*PathFetch)
+	}
+
+	ret := &PathFetch{
+		Operation:   op,
+		Path:        path,
+		ParsedCache: make(map[string]interface{}),
+	}
+
+	response, err := e.Client.Logical().ReadRawWithData(path, data)
+	ret.Response = response
+	if err != nil {
+		ret.FetchError = err
+	} else {
+		// Not all secrets will parse correctly. Sometimes we really want
+		// to fetch a raw endpoint, sometimes we're run with a bad mount
+		// or missing permissions.
+		secret, secretErr := e.Client.Logical().ParseRawResponseAndCloseBody(response, nil)
+		if secretErr != nil {
+			ret.SecretParseError = secretErr
+		} else {
+			ret.Secret = secret
+		}
+	}
+
+	e.Resources[path][op] = ret
+	return ret, ret.FetchSurfaceError()
+}
+
+// PathFetch records the outcome of fetching a single path with a single
+// operation, as produced and cached by Executor.FetchIfNotFetched.
+type PathFetch struct {
+	// Operation is the operation that was performed against Path.
+	Operation logical.Operation
+	// Path is the request path after mount templating.
+	Path string
+	// Response is the raw API response as returned by the client.
+	Response *api.Response
+	// FetchError is the error returned by the raw request, if any.
+	FetchError error
+	// Secret is the parsed response; it is only set when the request
+	// succeeded and parsing returned no error.
+	Secret *api.Secret
+	// SecretParseError is the error from parsing the response, if any.
+	SecretParseError error
+	// ParsedCache starts out empty; FetchIfNotFetched never writes to it.
+	// NOTE(review): presumably scratch space for checks to memoize parsed
+	// data -- confirm against the callers.
+	ParsedCache map[string]interface{}
+}
+
+// IsOK reports whether the fetch produced a response without an error.
+func (p *PathFetch) IsOK() bool {
+	if p.FetchError != nil {
+		return false
+	}
+	return p.Response != nil
+}
+
+// IsSecretOK reports whether the fetch succeeded and its response was also
+// parsed into a non-nil secret without error.
+func (p *PathFetch) IsSecretOK() bool {
+	if !p.IsOK() || p.SecretParseError != nil {
+		return false
+	}
+	return p.Secret != nil
+}
+
+// FetchSurfaceError returns the error, if any, that should be surfaced to
+// the caller for this fetch.
+//
+// Successful fetches return nil, as do failures classified as permission
+// errors, unsupported paths, or missing resources; those are left for
+// callers to inspect through the corresponding Is* predicates. A "route
+// entry not found" failure is wrapped with a hint that the mount may be
+// wrong. Any other failure is returned unchanged.
+func (p *PathFetch) FetchSurfaceError() error {
+	if p.IsOK() {
+		return nil
+	}
+
+	// A fetch with neither a response nor an error has nothing to classify;
+	// report it explicitly rather than dereferencing the nil FetchError
+	// below.
+	if p.FetchError == nil {
+		return fmt.Errorf("no response received for %v on %v", p.Operation, p.Path)
+	}
+
+	if p.IsSecretPermissionsError() || p.IsUnsupportedPathError() || p.IsMissingResource() {
+		return nil
+	}
+
+	if strings.Contains(p.FetchError.Error(), "route entry not found") {
+		return fmt.Errorf("Error making API request: was a bad mount given?\n\nOperation: %v\nPath: %v\nOriginal Error:\n%w", p.Operation, p.Path, p.FetchError)
+	}
+
+	return p.FetchError
+}
+
+// IsSecretPermissionsError reports whether the fetch failed with an error
+// whose message contains "permission denied".
+//
+// A non-nil FetchError already implies the fetch was not OK, so checking it
+// directly is sufficient and avoids dereferencing a nil error when the fetch
+// has neither a response nor an error.
+func (p *PathFetch) IsSecretPermissionsError() bool {
+	return p.FetchError != nil && strings.Contains(p.FetchError.Error(), "permission denied")
+}
+
+// IsUnsupportedPathError reports whether the fetch failed with an error
+// whose message contains "unsupported path".
+//
+// A non-nil FetchError already implies the fetch was not OK, so checking it
+// directly is sufficient and avoids dereferencing a nil error when the fetch
+// has neither a response nor an error.
+func (p *PathFetch) IsUnsupportedPathError() bool {
+	return p.FetchError != nil && strings.Contains(p.FetchError.Error(), "unsupported path")
+}
+
+// IsMissingResource reports whether the fetch failed with an error whose
+// message contains "unable to find".
+//
+// A non-nil FetchError already implies the fetch was not OK, so checking it
+// directly is sufficient and avoids dereferencing a nil error when the fetch
+// has neither a response nor an error.
+func (p *PathFetch) IsMissingResource() bool {
+	return p.FetchError != nil && strings.Contains(p.FetchError.Error(), "unable to find")
+}
+
+// Check is a single health check that an Executor can configure and run.
+type Check interface {
+	// Name identifies the check. Executor.BuildConfig rejects empty and
+	// duplicate names, and the name keys both configuration and results.
+	Name() string
+	// IsEnabled reports whether the check should run; Executor.Execute
+	// skips checks for which it returns false.
+	IsEnabled() bool
+
+	// DefaultConfig returns the check's default options. BuildConfig
+	// assumes it lists every supported option and rejects external options
+	// not present in it.
+	DefaultConfig() map[string]interface{}
+	// LoadConfig applies the merged configuration built by BuildConfig.
+	LoadConfig(config map[string]interface{}) error
+
+	// FetchResources is called by Executor.Execute before Evaluate.
+	// NOTE(review): presumably it loads the paths the check needs via
+	// e.FetchIfNotFetched -- confirm against implementations.
+	FetchResources(e *Executor) error
+
+	// Evaluate produces the check's results; it is called by
+	// Executor.Execute after FetchResources has succeeded.
+	Evaluate(e *Executor) ([]*Result, error)
+}
+
+// ResultStatus is the status code attached to a health check Result.
+type ResultStatus int
+
+// Known result statuses. Their display names are listed in
+// ResultStatusNameMap.
+const (
+	ResultNotApplicable ResultStatus = iota
+	ResultOK
+	ResultInformational
+	ResultWarning
+	ResultCritical
+	ResultInvalidVersion
+	ResultInsufficientPermissions
+)
+
+// ResultStatusNameMap maps each ResultStatus to its display name.
+// Executor.Execute uses it to fill in Result.StatusDisplay.
+var ResultStatusNameMap = map[ResultStatus]string{
+	ResultNotApplicable:           "not_applicable",
+	ResultOK:                      "ok",
+	ResultInformational:           "informational",
+	ResultWarning:                 "warning",
+	ResultCritical:                "critical",
+	ResultInvalidVersion:          "invalid_version",
+	ResultInsufficientPermissions: "insufficient_permissions",
+}
+
+// NameResultStatusMap is the inverse of ResultStatusNameMap: it maps a
+// display name back to its ResultStatus.
+var NameResultStatusMap = map[string]ResultStatus{
+	"not_applicable":           ResultNotApplicable,
+	"ok":                       ResultOK,
+	"informational":            ResultInformational,
+	"warning":                  ResultWarning,
+	"critical":                 ResultCritical,
+	"invalid_version":          ResultInvalidVersion,
+	"insufficient_permissions": ResultInsufficientPermissions,
+}
+
+// Result is a single finding reported by a health check.
+type Result struct {
+	// Status is the numeric status code of the finding.
+	Status ResultStatus `json:"status_code"`
+	// StatusDisplay is the display name of Status; Executor.Execute sets
+	// it from ResultStatusNameMap.
+	StatusDisplay string `json:"status"`
+	// Endpoint is the path the finding relates to. It may contain the
+	// "{{mount}}" placeholder, which Executor.Execute expands.
+	Endpoint string `json:"endpoint,omitempty"`
+	// Message is the text accompanying the finding.
+	Message string `json:"message,omitempty"`
+}