diff --git a/docs/metrics.md b/docs/metrics.md index 5b568f9251..30b7522cd9 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -126,6 +126,7 @@ A list of component-level metrics emitted by Diego. Contributors interested in a | `GardenContainerDestructionFailedDuration` | Time the rep's Garden backend took to destroy a container. Emitted after every failed container destruction. | ns | | `GardenContainerDestructionSucceededDuration` | Time the rep's Garden backend took to destroy a container. Emitted after every successful container destruction. | ns | | `GardenHealthCheckFailed` | Whether the cell has failed to pass its healthcheck against the garden backend. 0 signifies healthy, and 1 signifies unhealthy. Emitted periodically. | 0 or 1 (boolean) | +| `HTTPLivenessChecksFailedCount` | Count of failed HTTP Liveness Checks | number | | `RepBulkSyncDuration` | Time the cell rep took to synchronize the ActualLRPs it has claimed with its actual garden containers. Emitted periodically by each rep. | ns | | `RequestsStarted` | Cumulative number of requests of a particular type that have been made. Currently tracking `CancelTask`, `ContainerMetrics`, `Perform`, `Reset`, `State`, and `StopLRPInstance` requests. Emitted every 60 seconds. | number | | `RequestsSucceeded` | Cumulative number of requests of a particular type that have completed successfully. Currently tracking `CancelTask`, `ContainerMetrics`, `Perform`, `Reset`, `State`, and `StopLRPInstance` requests. Emitted every 60 seconds. | number | @@ -135,6 +136,7 @@ A list of component-level metrics emitted by Diego. Contributors interested in a | `StalledGardenDuration` | Time the rep is waiting on its garden backend to become healthy during startup. Emitted only if garden not responsive when the rep starts up. | ns | | `StartingContainerCount` | Number of containers currently in a Reserved, Initializing, or Created state. Emitted periodically. 
| number | | `StrandedEvacuatingActualLRPs` | Evacuating ActualLPRs that timed out during the evacuation process. Emitted when evacuation doesn't complete successful. | number | +| `TCPLivenessChecksFailedCount` | Count of failed TCP Liveness Checks | number | | `VolmanMountDuration` | Time volman took to mount a volume. Emitted by each rep when volumes are mounted. | ns | | `VolmanMountDurationFor` | Time volman took to mount a volume with a specific volume driver. Emitted by each rep when volumes are mounted. | ns | | `VolmanMountErrors` | Count of failed volume mounts. Emitted periodically by each rep. | number | diff --git a/jobs/rep/spec b/jobs/rep/spec index 405f90f465..0737d9d1a3 100644 --- a/jobs/rep/spec +++ b/jobs/rep/spec @@ -235,7 +235,9 @@ properties: enable_declarative_healthcheck: description: "When set, enables the rep to prefer the LRP CheckDefinition to healthcheck instances over the Monitor action. Requires Garden-Runc v1.10.0+" default: false - + enable_healthcheck_metrics: + description: "When set, enables the rep to emit healthcheck failure metrics. Requires enable_declarative_healthcheck to be set to true." + default: false cell_registrations.locket.enabled: description: Enable the cell rep to register itself as a service with Locket. 
default: true diff --git a/jobs/rep/templates/rep.json.erb b/jobs/rep/templates/rep.json.erb index 733aca8af1..1cf981d081 100644 --- a/jobs/rep/templates/rep.json.erb +++ b/jobs/rep/templates/rep.json.erb @@ -58,6 +58,7 @@ disk_mb: p("diego.executor.disk_capacity_mb").to_s, enable_declarative_healthcheck: p("enable_declarative_healthcheck"), declarative_healthcheck_path: "/var/vcap/packages/healthcheck", + enable_healthcheck_metrics: p("enable_healthcheck_metrics"), enable_container_proxy: p("containers.proxy.enabled"), container_proxy_require_and_verify_client_certs: p("containers.proxy.require_and_verify_client_certificates"), container_proxy_trusted_ca_certs: p("containers.proxy.trusted_ca_certificates"), diff --git a/jobs/rep_windows/spec b/jobs/rep_windows/spec index 0521d10619..4a7dfefc6f 100644 --- a/jobs/rep_windows/spec +++ b/jobs/rep_windows/spec @@ -237,13 +237,16 @@ properties: diego.rep.locket.client_keepalive_timeout: description: "Timeout in seconds to receive a response to the keepalive ping. If a response is not received within this time, the locket client will reconnect to another server." default: 22 - + enable_declarative_healthcheck: description: "When set, enables the rep to prefer the LRP CheckDefinition to healthcheck instances over the Monitor action." default: false declarative_healthcheck_path: description: "The directory containing the declarative healthcheck binary" default: "/var/vcap/packages/healthcheck_windows/external" + enable_healthcheck_metrics: + description: "When set, enables the rep to emit healthcheck failure metrics. Requires enable_declarative_healthcheck to be set to true." + default: false cell_registrations.locket.enabled: description: Enable the cell rep to register itself as a service with Locket. 
diff --git a/jobs/rep_windows/templates/rep.json.erb b/jobs/rep_windows/templates/rep.json.erb index 4bd8c4b397..825fe6e9e1 100644 --- a/jobs/rep_windows/templates/rep.json.erb +++ b/jobs/rep_windows/templates/rep.json.erb @@ -58,6 +58,7 @@ disk_mb: p("diego.executor.disk_capacity_mb").to_s, enable_declarative_healthcheck: p("enable_declarative_healthcheck"), declarative_healthcheck_path: p("declarative_healthcheck_path"), + enable_healthcheck_metrics: p("enable_healthcheck_metrics"), enable_container_proxy: p("containers.proxy.enabled"), container_proxy_require_and_verify_client_certs: p("containers.proxy.require_and_verify_client_certificates"), container_proxy_trusted_ca_certs: p("containers.proxy.trusted_ca_certificates"),