agent: Remove vm-informant support (#506)
Removes all vm-informant related code from the autoscaler-agent.
Also renames all metrics that previously included 'informant' in the
name to 'monitor' instead.

Co-authored-by: Em Sharnoff <[email protected]>
fprasx and sharnoff authored Sep 28, 2023
1 parent 967a93a commit af638e2
Showing 15 changed files with 488 additions and 2,276 deletions.
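To illustrate the metric rename described in the commit message, here is a before/after sketch of one such definition using the standard Prometheus Go client. The metric name itself is hypothetical; the real definitions live in the agent's metrics code.

import "github.com/prometheus/client_golang/prometheus"

// Hypothetical metric touched by the rename; actual names in pkg/agent may differ.
var monitorRequestsTotal = prometheus.NewCounter(prometheus.CounterOpts{
	// Before this commit: "autoscaling_agent_informant_requests_total"
	Name: "autoscaling_agent_monitor_requests_total",
	Help: "Requests sent to the vm-monitor",
})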
34 changes: 17 additions & 17 deletions ARCHITECTURE-network-diagram.org
@@ -32,29 +32,29 @@ awk '/#\+BEGIN_SRC/{flag=1;next}/#\+END_SRC/{flag=0}flag' ARCHITECTURE-network-diagram.org
| +---------| autoscaler agent |
| | |
| | (one per K8s node) |
| +-----------------*--+
| | | ^ 10302
| | | |
| | | |
+=================|==================================|===========|===|======+
: K8s pod | | | | :
: QMP | | | | :
: 20183 V | | | :
: +---------------*----------------------------------|-----------|---|---+ :
: | | | | | :
: | QEMU process | | | | :
: | | | | | :
: | | | | | :
: | compute_ctl postgres metrics | informant | | | :
: | mgmt API postgres prometheus | informant | | | :
: | 3080 5432 9100 V 10301 V | | :
| +--------------------+
| | |
| | |
| | |
+=================|==================================|===========|==========+
: K8s pod | | | :
: QMP | | | :
: 20183 V | | :
: +---------------*----------------------------------|-----------|-------+ :
: | | | | :
: | QEMU process | | | :
: | | | | :
: | | | | :
: | compute_ctl postgres metrics | monitor | | :
: | mgmt API postgres prometheus | websocket | | :
: | 3080 5432 9100 V 10301 V | :
: +------------------------*-----------*-------------*-----------*-------+ :
: | VM | :
: | | :
: | Inside the VM runs: | :
: | - compute_ctl (listens on port 3080) | :
: | - VM monitor (port 10301 via websocket) | :
: | - Postgres (port 5432) | :
: | - VM informant (port 10301) | :
: | - vector (metrics on port 9100) | :
: | | :
: +----------------------------------------------------------------------+ :
Binary file modified ARCHITECTURE-network-diagram.png
8 changes: 4 additions & 4 deletions ARCHITECTURE.md
@@ -49,10 +49,10 @@ A third component, a binary running inside of the VM to (a) handle being upscale
The scheduler plugin is responsible for handling resource requests from the `autoscaler-agent`,
capping increases so that node resources aren't overcommitted.

The `autoscaler-agent` periodically reads from a metrics source in the VM (defined by the
_informant_) and makes scaling decisions about the _desired_ resource allocation. It then
requests these resources from the scheduler plugin, and submits a patch request for its NeonVM to
update the resources.
The `autoscaler-agent` periodically reads from a metrics source in the VM (currently vector's
`node_exporter`-like functionality) and makes scaling decisions about the _desired_ resource
allocation. It then requests these resources from the scheduler plugin, and submits a patch request
for its NeonVM to update the resources.

The VM monitor is responsible for handling all of the resource management functionality inside
the VM that the `autoscaler-agent` cannot. This constitutes handling upscales (e.g. increasing Postgres
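The hunk above describes the agent's core control loop. Here is a minimal sketch of that loop; all types and helper names are hypothetical stand-ins for the agent's real internals, which live in pkg/agent.

package agentsketch

import (
	"context"
	"time"
)

// Hypothetical stand-ins mirroring the flow described above:
// read metrics -> decide -> ask the scheduler plugin -> patch the NeonVM.
type Metrics struct{ LoadAverage1Min float64 }
type Resources struct{ VCPU, MemSlots uint16 }

type Agent struct {
	MetricsInterval      time.Duration
	FetchVMMetrics       func(context.Context) (Metrics, error)              // scrape the in-VM metrics source
	DesiredResources     func(Metrics) Resources                             // scaling decision
	RequestFromScheduler func(context.Context, Resources) (Resources, error) // plugin may cap the increase
	PatchNeonVM          func(context.Context, Resources) error              // submit the patch request
}

func (a *Agent) Run(ctx context.Context) error {
	ticker := time.NewTicker(a.MetricsInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
		m, err := a.FetchVMMetrics(ctx)
		if err != nil {
			continue // transient failure: retry on the next tick
		}
		desired := a.DesiredResources(m)
		approved, err := a.RequestFromScheduler(ctx, desired)
		if err != nil {
			continue
		}
		_ = a.PatchNeonVM(ctx, approved) // on failure, retry on the next tick
	}
}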
20 changes: 7 additions & 13 deletions deploy/agent/config_map.yaml
@@ -13,23 +13,17 @@ data:
"memoryUsageFractionTarget": 0.75
}
},
"informant": {
"serverPort": 10301,
"callbackPort": 10302,
"retryServerMinWaitSeconds": 5,
"retryServerNormalWaitSeconds": 5,
"registerRetrySeconds": 5,
"requestTimeoutSeconds": 1,
"registerTimeoutSeconds": 2,
"downscaleTimeoutSeconds": 2,
"unhealthyAfterSilenceDurationSeconds": 20,
"unhealthyStartupGracePeriodSeconds": 20
},
"monitor": {
"serverPort": 10301,
"responseTimeoutSeconds": 5,
"connectionTimeoutSeconds": 4
"connectionTimeoutSeconds": 4,
"connectionRetryMinWaitSeconds": 5,
"unhealthyAfterSilenceDurationSeconds": 20,
"unhealthyStartupGracePeriodSeconds": 20,
"maxHealthCheckSequentialFailuresSeconds": 30
},
"metrics": {
"port": 9100,
"loadMetricPrefix": "host_",
"requestTimeoutSeconds": 2,
"secondsBetweenRequests": 5
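For reference, a self-contained sketch of how a block like the "monitor" object above can be decoded in Go, with the *Seconds integers converted to durations at the point of use. This is an assumed pattern, not the agent's exact loading code.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Field names mirror the JSON keys in the "monitor" block above (subset).
type MonitorConfig struct {
	ServerPort                    uint16 `json:"serverPort"`
	ResponseTimeoutSeconds        uint   `json:"responseTimeoutSeconds"`
	ConnectionTimeoutSeconds      uint   `json:"connectionTimeoutSeconds"`
	ConnectionRetryMinWaitSeconds uint   `json:"connectionRetryMinWaitSeconds"`
}

func main() {
	raw := `{"serverPort": 10301, "responseTimeoutSeconds": 5, "connectionTimeoutSeconds": 4, "connectionRetryMinWaitSeconds": 5}`
	var c MonitorConfig
	if err := json.Unmarshal([]byte(raw), &c); err != nil {
		panic(err)
	}
	// The config stores plain integers; convert to time.Duration where used.
	connTimeout := time.Duration(c.ConnectionTimeoutSeconds) * time.Second
	fmt.Println(c.ServerPort, connTimeout) // 10301 4s
}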
76 changes: 23 additions & 53 deletions pkg/agent/config.go
@@ -13,7 +13,6 @@ import (

type Config struct {
	Scaling   ScalingConfig   `json:"scaling"`
	Informant InformantConfig `json:"informant"`
	Metrics   MetricsConfig   `json:"metrics"`
	Scheduler SchedulerConfig `json:"scheduler"`
	Monitor   MonitorConfig   `json:"monitor"`
@@ -26,6 +25,21 @@ type MonitorConfig struct {
	// ConnectionTimeoutSeconds gives how long we may take to connect to the
	// monitor before cancelling.
	ConnectionTimeoutSeconds uint `json:"connectionTimeoutSeconds"`
	// ConnectionRetryMinWaitSeconds gives the minimum amount of time we must wait between attempts
	// to connect to the vm-monitor, regardless of whether they're successful.
	ConnectionRetryMinWaitSeconds uint `json:"connectionRetryMinWaitSeconds"`
	// ServerPort is the port that the dispatcher serves from
	ServerPort uint16 `json:"serverPort"`
	// UnhealthyAfterSilenceDurationSeconds gives the duration, in seconds, after which failing to
	// receive a successful request from the monitor indicates that it is probably unhealthy.
	UnhealthyAfterSilenceDurationSeconds uint `json:"unhealthyAfterSilenceDurationSeconds"`
	// UnhealthyStartupGracePeriodSeconds gives the duration, in seconds, after which we will no
	// longer excuse total VM monitor failures - i.e. when unhealthyAfterSilenceDurationSeconds
	// kicks in.
	UnhealthyStartupGracePeriodSeconds uint `json:"unhealthyStartupGracePeriodSeconds"`
	// MaxHealthCheckSequentialFailuresSeconds gives the duration, in seconds, after which we
	// should restart the connection to the vm-monitor if health checks aren't succeeding.
	MaxHealthCheckSequentialFailuresSeconds uint `json:"maxHealthCheckSequentialFailuresSeconds"`
}

// DumpStateConfig configures the endpoint to dump all internal state
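As a sketch of how the new retry fields interact, the hypothetical loop below gives each dial attempt ConnectionTimeoutSeconds and spaces attempts at least ConnectionRetryMinWaitSeconds apart. It assumes the MonitorConfig type above plus the standard context and time imports; the real dispatcher logic differs in detail.

// connectWithRetry is a hypothetical illustration, not the agent's actual code.
func connectWithRetry(ctx context.Context, c MonitorConfig, dial func(context.Context) error) error {
	minWait := time.Duration(c.ConnectionRetryMinWaitSeconds) * time.Second
	for {
		start := time.Now()
		dialCtx, cancel := context.WithTimeout(ctx, time.Duration(c.ConnectionTimeoutSeconds)*time.Second)
		err := dial(dialCtx)
		cancel()
		if err == nil {
			return nil
		}
		// Enforce the minimum gap between attempts, however fast this one failed.
		if rest := minWait - time.Since(start); rest > 0 {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-time.After(rest):
			}
		}
	}
}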
Expand All @@ -46,50 +60,10 @@ type ScalingConfig struct {
	DefaultConfig api.ScalingConfig `json:"defaultConfig"`
}

type InformantConfig struct {
	// ServerPort is the port that the VM informant serves from
	ServerPort uint16 `json:"serverPort"`

	// CallbackPort is the port that the agent listens on for informant -> agent requests
	CallbackPort int `json:"callbackPort"`

	// RetryServerMinWaitSeconds gives the minimum duration, in seconds, that we must wait between the
	// start of one InformantServer and the next
	//
	// This "minimum wait" is only used when the
	RetryServerMinWaitSeconds uint `json:"retryServerMinWaitSeconds"`
	// RetryServerNormalWaitSeconds gives the typical duration, in seconds, that we wait between an
	// InformantServer failing and our retry.
	RetryServerNormalWaitSeconds uint `json:"retryServerNormalWaitSeconds"`
	// RegisterRetrySeconds gives the duration, in seconds, to wait between retrying a failed
	// register request.
	RegisterRetrySeconds uint `json:"registerRetrySeconds"`

	// RequestTimeoutSeconds gives the timeout for any individual request to the informant, except
	// for those with separately-defined values below.
	RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
	// RegisterTimeoutSeconds gives the timeout duration, in seconds, for a register request.
	//
	// This is a separate field from RequestTimeoutSeconds because registering may require that the
	// informant suspend a previous agent, which could take longer.
	RegisterTimeoutSeconds uint `json:"registerTimeoutSeconds"`
	// DownscaleTimeoutSeconds gives the timeout duration, in seconds, for a downscale request.
	//
	// This is a separate field from RequestTimeoutSeconds because it's possible that downscaling may
	// require some non-trivial work that we want to allow to complete.
	DownscaleTimeoutSeconds uint `json:"downscaleTimeoutSeconds"`

	// UnhealthyAfterSilenceDurationSeconds gives the duration, in seconds, after which failing to
	// receive a successful request from the informant indicates that it is probably unhealthy.
	UnhealthyAfterSilenceDurationSeconds uint `json:"unhealthyAfterSilenceDurationSeconds"`
	// UnhealthyStartupGracePeriodSeconds gives the duration, in seconds, after which we will no
	// longer excuse total VM informant failures - i.e. when unhealthyAfterSilenceDurationSeconds
	// kicks in.
	UnhealthyStartupGracePeriodSeconds uint `json:"unhealthyStartupGracePeriodSeconds"`
}

// MetricsConfig defines a few parameters for metrics requests to the VM
type MetricsConfig struct {
	// Port is the port that VMs are expected to provide metrics on
	Port uint16 `json:"port"`
	// LoadMetricPrefix is the prefix at the beginning of the load metrics that we use. For
	// node_exporter, this is "node_", and for vector it's "host_"
	LoadMetricPrefix string `json:"loadMetricPrefix"`
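A small hypothetical helper showing how LoadMetricPrefix is applied: the agent reads load metrics by name with the prefix prepended, so node_exporter yields node_load1 and vector yields host_load1.

// loadAverage1Min is illustrative only: it selects the 1-minute load average
// from scraped samples using the configured prefix.
func loadAverage1Min(loadMetricPrefix string, samples map[string]float64) (float64, bool) {
	v, ok := samples[loadMetricPrefix+"load1"]
	return v, ok
}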
@@ -152,21 +126,17 @@ func (c *Config) validate() error {
	erc.Whenf(ec, c.Billing != nil && c.Billing.URL == "", emptyTmpl, ".billing.url")
	erc.Whenf(ec, c.DumpState != nil && c.DumpState.Port == 0, zeroTmpl, ".dumpState.port")
	erc.Whenf(ec, c.DumpState != nil && c.DumpState.TimeoutSeconds == 0, zeroTmpl, ".dumpState.timeoutSeconds")
	erc.Whenf(ec, c.Informant.DownscaleTimeoutSeconds == 0, zeroTmpl, ".informant.downscaleTimeoutSeconds")
	erc.Whenf(ec, c.Informant.RegisterRetrySeconds == 0, zeroTmpl, ".informant.registerRetrySeconds")
	erc.Whenf(ec, c.Informant.RegisterTimeoutSeconds == 0, zeroTmpl, ".informant.registerTimeoutSeconds")
	erc.Whenf(ec, c.Informant.RequestTimeoutSeconds == 0, zeroTmpl, ".informant.requestTimeoutSeconds")
	erc.Whenf(ec, c.Informant.RetryServerMinWaitSeconds == 0, zeroTmpl, ".informant.retryServerMinWaitSeconds")
	erc.Whenf(ec, c.Informant.RetryServerNormalWaitSeconds == 0, zeroTmpl, ".informant.retryServerNormalWaitSeconds")
	erc.Whenf(ec, c.Informant.ServerPort == 0, zeroTmpl, ".informant.serverPort")
	erc.Whenf(ec, c.Informant.CallbackPort == 0, zeroTmpl, ".informant.callbackPort")
	erc.Whenf(ec, c.Informant.UnhealthyAfterSilenceDurationSeconds == 0, zeroTmpl, ".informant.unhealthyAfterSilenceDurationSeconds")
	erc.Whenf(ec, c.Informant.UnhealthyStartupGracePeriodSeconds == 0, zeroTmpl, ".informant.unhealthyStartupGracePeriodSeconds")
	erc.Whenf(ec, c.Metrics.Port == 0, zeroTmpl, ".metrics.port")
	erc.Whenf(ec, c.Metrics.LoadMetricPrefix == "", emptyTmpl, ".metrics.loadMetricPrefix")
	erc.Whenf(ec, c.Metrics.SecondsBetweenRequests == 0, zeroTmpl, ".metrics.secondsBetweenRequests")
	erc.Whenf(ec, c.Scaling.RequestTimeoutSeconds == 0, zeroTmpl, ".scaling.requestTimeoutSeconds")
	erc.Whenf(ec, c.Monitor.ResponseTimeoutSeconds == 0, zeroTmpl, ".monitor.responseTimeoutSeconds")
	erc.Whenf(ec, c.Monitor.ConnectionTimeoutSeconds == 0, zeroTmpl, ".monitor.connectionTimeoutSeconds")
	erc.Whenf(ec, c.Monitor.ConnectionRetryMinWaitSeconds == 0, zeroTmpl, ".monitor.connectionRetryMinWaitSeconds")
	erc.Whenf(ec, c.Monitor.ServerPort == 0, zeroTmpl, ".monitor.serverPort")
	erc.Whenf(ec, c.Monitor.UnhealthyAfterSilenceDurationSeconds == 0, zeroTmpl, ".monitor.unhealthyAfterSilenceDurationSeconds")
	erc.Whenf(ec, c.Monitor.UnhealthyStartupGracePeriodSeconds == 0, zeroTmpl, ".monitor.unhealthyStartupGracePeriodSeconds")
	erc.Whenf(ec, c.Monitor.MaxHealthCheckSequentialFailuresSeconds == 0, zeroTmpl, ".monitor.maxHealthCheckSequentialFailuresSeconds")
	// add all errors if there are any: https://github.com/neondatabase/autoscaling/pull/195#discussion_r1170893494
	ec.Add(c.Scaling.DefaultConfig.Validate())
	erc.Whenf(ec, c.Scheduler.RequestPort == 0, zeroTmpl, ".scheduler.requestPort")
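The erc.Whenf calls above accumulate every violation rather than stopping at the first failure. Below is a dependency-free sketch of that collect-then-join pattern; the collector, the template wording, and the reuse of MonitorConfig from the diff above are all simplifications, not the erc package's actual API.

package validatesketch

import (
	"errors"
	"fmt"
)

// collector is a simplified stand-in for the error collector used above.
type collector struct{ errs []error }

// whenf records an error built from tmpl/args when cond holds.
func (c *collector) whenf(cond bool, tmpl string, args ...any) {
	if cond {
		c.errs = append(c.errs, fmt.Errorf(tmpl, args...))
	}
}

// resolve joins all recorded errors, or returns nil if there were none.
func (c *collector) resolve() error { return errors.Join(c.errs...) }

func validateMonitor(m MonitorConfig) error {
	const zeroTmpl = "field %q must not be zero" // assumed wording
	ec := &collector{}
	ec.whenf(m.ServerPort == 0, zeroTmpl, ".monitor.serverPort")
	ec.whenf(m.ConnectionTimeoutSeconds == 0, zeroTmpl, ".monitor.connectionTimeoutSeconds")
	ec.whenf(m.ConnectionRetryMinWaitSeconds == 0, zeroTmpl, ".monitor.connectionRetryMinWaitSeconds")
	return ec.resolve()
}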