Skip to content

Commit

Permalink
feat: add node collector resource settings to odigos config (#1979)
Browse files Browse the repository at this point in the history
This allows to configure the node collector resource settings from
odigos config, and add the memory values to odigos_config

---------

Co-authored-by: Ben Elferink <[email protected]>
  • Loading branch information
blumamir and BenElferink authored Dec 13, 2024
1 parent c05d0dc commit 0134e62
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 17 deletions.
31 changes: 31 additions & 0 deletions cli/cmd/resources/odigosconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,12 @@ func (a *odigosConfigResourceManager) InstallFromScratch(ctx context.Context) er

sizingProfile := k8sprofiles.FilterSizeProfiles(a.config.Profiles)
collectorGatewayConfig := GetGatewayConfigBasedOnSize(sizingProfile)
collectorNodeConfig := GetNodeCollectorConfigBasedOnSize(sizingProfile)
a.config.CollectorGateway = collectorGatewayConfig
if a.config.CollectorNode != nil {
collectorNodeConfig.CollectorOwnMetricsPort = a.config.CollectorNode.CollectorOwnMetricsPort
}
a.config.CollectorNode = collectorNodeConfig

obj, err := NewOdigosConfiguration(a.ns, a.config)
if err != nil {
Expand All @@ -63,6 +68,32 @@ func (a *odigosConfigResourceManager) InstallFromScratch(ctx context.Context) er
return a.client.ApplyResources(ctx, a.config.ConfigVersion, resources)
}

// GetNodeCollectorConfigBasedOnSize maps a sizing profile to the node
// collector memory settings for that size. The given profile is checked
// first, then each of its dependency profiles, and the first sizing profile
// found wins. It returns nil when no sizing profile applies.
func GetNodeCollectorConfigBasedOnSize(profile common.ProfileName) *common.CollectorNodeConfiguration {
	// Node collector memory settings per sizing profile.
	sizeConfigs := map[common.ProfileName]*common.CollectorNodeConfiguration{
		k8sprofiles.SizeSProfile.ProfileName: {
			RequestMemoryMiB: 150,
			LimitMemoryMiB:   300,
		},
		k8sprofiles.SizeMProfile.ProfileName: {
			RequestMemoryMiB: 250,
			LimitMemoryMiB:   500,
		},
		k8sprofiles.SizeLProfile.ProfileName: {
			RequestMemoryMiB: 500,
			LimitMemoryMiB:   750,
		},
	}

	// The profile itself takes precedence over its dependencies.
	candidates := append([]common.ProfileName{profile}, k8sprofiles.ProfilesMap[profile].Dependencies...)
	for _, candidate := range candidates {
		if cfg, ok := sizeConfigs[candidate]; ok {
			return cfg
		}
	}
	// Return nil if no matching profile is found.
	return nil
}

func GetGatewayConfigBasedOnSize(profile common.ProfileName) *common.CollectorGatewayConfiguration {
aggregateProfiles := append([]common.ProfileName{profile}, k8sprofiles.ProfilesMap[profile].Dependencies...)

Expand Down
25 changes: 25 additions & 0 deletions common/odigos_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,31 @@ type CollectorNodeConfiguration struct {
// The port to use for exposing the collector's own metrics as a prometheus endpoint.
// This can be used to resolve conflicting ports when a collector is using the host network.
CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort,omitempty"`

// RequestMemoryMiB is the memory request for the node collector daemonset.
// it will be embedded in the daemonset as a resource request of the form "memory: <value>Mi"
// default value is 250Mi
RequestMemoryMiB int `json:"requestMemoryMiB,omitempty"`

// LimitMemoryMiB is the memory limit for the node collector daemonset.
// it will be embedded in the daemonset as a resource limit of the form "memory: <value>Mi"
// default value is 2x the memory request.
LimitMemoryMiB int `json:"limitMemoryMiB,omitempty"`

// this parameter sets the "limit_mib" parameter in the memory limiter configuration for the node collector.
// it is the hard limit after which a force garbage collection will be performed.
// if not set, it will be 50Mi below the memory limit.
MemoryLimiterLimitMiB int `json:"memoryLimiterLimitMiB,omitempty"`

// this parameter sets the "spike_limit_mib" parameter in the memory limiter configuration for the node collector.
// note that this is not the processor soft limit, but the difference in MiB between the hard limit and the soft limit.
// if not set, this will be set to 20% of the hard limit (so the soft limit will be 80% of the hard limit).
MemoryLimiterSpikeLimitMiB int `json:"memoryLimiterSpikeLimitMiB,omitempty"`

// the GOMEMLIMIT environment variable value for the node collector daemonset.
// this is when go runtime will start garbage collection.
// if not specified, it will be set to 80% of the hard limit of the memory limiter.
GoMemLimitMib int `json:"goMemLimitMiB,omitempty"`
}

type CollectorGatewayConfiguration struct {
Expand Down
50 changes: 44 additions & 6 deletions docs/pipeline/configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Odigos sets up an observability pipeline in your Kubernetes cluster to collect,
User can configure the following components:

- **Cluster Gateway Collector** - a collector that runs as a k8s Deployment. It receives the OpenTelemetry data from the Node Collectors, processes it, and exports it to the configured destinations.
- **Node Data Collection Collector** - a collector that runs as a k8s DaemonSet. It collects and scrapes OpenTelemetry data from the applications running on the nodes and forwards it to the Cluster Gateway Collector.

### Configuring the Pipeline

Expand All @@ -23,13 +24,24 @@ When configuring the pipeline, you must use **only one of the two methods**—ei

#### 1. Using Profiles

Sizing Profiles `size_s`, `size_m`, `size_l` are pre-defined configurations designed to simplify pipeline configurations. Each profile specifies the following parameters for the **Cluster Gateway Collector**:
Sizing Profiles `size_s`, `size_m`, `size_l` are pre-defined configurations designed to simplify pipeline configurations. Each profile specifies the following parameters:

**Cluster Gateway Collector**:

| Profile | Minimum Replicas | HPA Maximum Replicas | Request CPU (m) | Limit CPU (m) | Request Memory (Mi) | Limit Memory (Mi) |
|----------|-----------------------|----------------------|-----------------|---------------|----------------------|-------------------|
| `size_s` | **1** | **5** | **150m** | **300m** | **300Mi** | **300Mi** |
| `size_m` | **2** | **8** | **500m** | **1000m** | **500Mi** | **600Mi** |
| `size_l` | **3** | **12** | **750m** | **1250m** | **750Mi** | **850Mi** |

**Node Data Collection Collector**:

| Profile | Request Memory (Mi) | Limit Memory (Mi) |
|----------|----------------------|-------------------|
| `size_s` | **150Mi** | **300Mi** |
| `size_m` | **250Mi** | **500Mi** |
| `size_l` | **500Mi** | **750Mi** |

| Profile | Minimum Replicas | HPA Maximum Replicas | Request CPU (m) | Limit CPU (m) | Request Memory (Mi)
|----------|-----------------------|----------------------|-----------------|---------------|----------------------|
| `size_s` | **1** | **5** | **150m** | **300m** | **300Mi** |
| `size_m` | **2** | **8** | **500m** | **1000m** | **500Mi** |
| `size_l` | **3** | **12** | **750m** | **1250m** | **750Mi** |

To use profiles, you need to use the [Odigos CLI Command for Profiles](/cli/odigos_profile).
This simplifies the setup process and ensures optimized settings for typical use cases.
Expand Down Expand Up @@ -79,6 +91,32 @@ collectorGateway:
# This is when the Go runtime will start garbage collection.
# Default: 80% of the hard limit of the memory limiter.
goMemLimitMiB:

collectorNode:
  # The memory request for the node collector daemonset.
  # It will be embedded in the daemonset as a resource request of the form "memory: <value>Mi".
  # Default: 250Mi
  requestMemoryMiB:

  # The memory limit for the node collector daemonset.
  # It will be embedded in the daemonset as a resource limit of the form "memory: <value>Mi".
  # Default: 2x the memory request.
  limitMemoryMiB:

  # Sets the "limit_mib" parameter in the memory limiter configuration for the node collector.
  # It is the hard limit after which a forced garbage collection will be performed.
  # Default: 50Mi below the memory limit.
  memoryLimiterLimitMiB:

  # Sets the "spike_limit_mib" parameter in the memory limiter configuration for the node collector.
  # Note that this is not the processor soft limit, but the difference in MiB between the hard limit and the soft limit.
  # Default: 20% of the hard limit (so the soft limit is 80% of the hard limit).
  memoryLimiterSpikeLimitMiB:

  # The GOMEMLIMIT environment variable value for the node collector daemonset.
  # This is when the Go runtime will start garbage collection.
  # Default: 80% of the hard limit of the memory limiter.
  goMemLimitMiB:
```
```


Expand Down
15 changes: 15 additions & 0 deletions helm/odigos/templates/odigos-config-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,21 @@ data:
{{- with .Values.collectorNode.collectorOwnMetricsPort }}
collectorOwnMetricsPort: {{ . }}
{{- end }}
{{- with .Values.collectorNode.requestMemoryMiB }}
requestMemoryMiB: {{ . }}
{{- end }}
{{- with .Values.collectorNode.limitMemoryMiB }}
limitMemoryMiB: {{ . }}
{{- end }}
{{- with .Values.collectorNode.memoryLimiterLimitMiB }}
memoryLimiterLimitMiB: {{ . }}
{{- end }}
{{- with .Values.collectorNode.memoryLimiterSpikeLimitMiB }}
memoryLimiterSpikeLimitMiB: {{ . }}
{{- end }}
{{- with .Values.collectorNode.goMemLimitMiB }}
goMemLimitMiB: {{ . }}
{{- end }}
{{- end }}
instrumentorImage: {{ .Values.instrumentor.image.repository }}
telemetryEnabled: {{ .Values.telemetry.enabled }}
Expand Down
25 changes: 25 additions & 0 deletions helm/odigos/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,31 @@ collectorNode:
# This can be used to resolve conflicting ports when a collector is using the host network.
collectorOwnMetricsPort: 55682

# RequestMemoryMiB is the memory request for the node collector daemonset.
# it will be embedded in the daemonset as a resource request of the form "memory: <value>Mi"
# default value is 250Mi
requestMemoryMiB: 250

# LimitMemoryMiB is the memory limit for the node collector daemonset.
# it will be embedded in the daemonset as a resource limit of the form "memory: <value>Mi"
# default value is 2x the memory request.
limitMemoryMiB: 500

# this parameter sets the "limit_mib" parameter in the memory limiter configuration for the node collector.
# it is the hard limit after which a force garbage collection will be performed.
# if not set, it will be 50Mi below the memory limit.
memoryLimiterLimitMiB: 450

# this parameter sets the "spike_limit_mib" parameter in the memory limiter configuration for the node collector.
# note that this is not the processor soft limit, but the difference in MiB between the hard limit and the soft limit.
# if not set, this will be set to 20% of the hard limit (so the soft limit will be 80% of the hard limit).
memoryLimiterSpikeLimitMiB: 55

# the GOMEMLIMIT environment variable value for the node collector daemonset.
# this is when go runtime will start garbage collection.
# if not specified, it will be set to 80% of the hard limit of the memory limiter.
goMemLimitMiB: 360

autoscaler:
image:
repository: keyval/odigos-autoscaler
Expand Down
2 changes: 1 addition & 1 deletion k8sutils/pkg/profiles/profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ type Object interface {
}

var (
// sizing profiles for the collector gateway
// sizing profiles for the collectors resource settings
SizeSProfile = Profile{
ProfileName: common.ProfileName("size_s"),
ShortDescription: "Small size deployment profile",
Expand Down
62 changes: 54 additions & 8 deletions scheduler/controllers/nodecollectorsgroup/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,30 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
	// the default memory request in MiB for the node collector daemonset,
	// used when no value is provided in the Odigos configuration.
	defaultRequestMemoryMiB = 256

	// this configures the processor limit_mib, which is the hard limit in MiB, after which garbage collection will be forced.
	// as recommended by the processor docs, if not set, this is set to 50MiB less than the memory limit of the collector.
	defaultMemoryLimiterLimitDiffMib = 50

	// the soft limit will be set to 80% of the hard limit.
	// this value is used to derive the "spike_limit_mib" parameter in the processor configuration if a value is not set.
	defaultMemoryLimiterSpikePercentage = 20.0

	// the percentage out of the memory limiter hard limit, at which the go runtime will start garbage collection.
	// it is used to calculate the GOMEMLIMIT environment variable value.
	defaultGoMemLimitPercentage = 80.0

	// the memory settings should prevent the collector from exceeding the memory request.
	// however, the mechanism is heuristic and does not guarantee to prevent OOMs.
	// allowing the memory limit to be slightly above the memory request can help in reducing the chances of OOMs in edge cases.
	// instead of having the process killed, it can use extra memory available on the node without allocating it preemptively.
	memoryLimitAboveRequestFactor = 2.0
)

func getMemorySettings(odigosConfig common.OdigosConfiguration) odigosv1.CollectorsGroupResourcesSettings {
// TODO: currently using hardcoded values, should be configurable.
//
// memory request is expensive on daemonsets since it will consume this memory
// on each node in the cluster. setting to 256, but allowing memory to spike higher
// to consume more available memory on the node.
Expand All @@ -31,13 +52,38 @@ func getMemorySettings(odigosConfig common.OdigosConfiguration) odigosv1.Collect
// - limit is set to request: collector most stable (no OOM) but smaller buffer for bursts and early data drop.
// - limit is set way above request: in case of memory spike, collector will use extra memory available on the node to buffer data, but might get killed by OOM killer if this memory is not available.
// currently choosing 512MiB as a balance (200MiB guaranteed for heap, and the rest ~300MiB of buffer from node before start dropping).
//

nodeCollectorConfig := odigosConfig.CollectorNode

memoryRequestMiB := defaultRequestMemoryMiB
if nodeCollectorConfig != nil && nodeCollectorConfig.RequestMemoryMiB > 0 {
memoryRequestMiB = nodeCollectorConfig.RequestMemoryMiB
}
memoryLimitMiB := int(float64(memoryRequestMiB) * memoryLimitAboveRequestFactor)
if nodeCollectorConfig != nil && nodeCollectorConfig.LimitMemoryMiB > 0 {
memoryLimitMiB = nodeCollectorConfig.LimitMemoryMiB
}

memoryLimiterLimitMiB := memoryLimitMiB - defaultMemoryLimiterLimitDiffMib
if nodeCollectorConfig != nil && nodeCollectorConfig.MemoryLimiterLimitMiB > 0 {
memoryLimiterLimitMiB = nodeCollectorConfig.MemoryLimiterLimitMiB
}
memoryLimiterSpikeLimitMiB := memoryLimiterLimitMiB * defaultMemoryLimiterSpikePercentage / 100
if nodeCollectorConfig != nil && nodeCollectorConfig.MemoryLimiterSpikeLimitMiB > 0 {
memoryLimiterSpikeLimitMiB = nodeCollectorConfig.MemoryLimiterSpikeLimitMiB
}

gomemlimitMiB := int(memoryLimiterLimitMiB * defaultGoMemLimitPercentage / 100.0)
if nodeCollectorConfig != nil && nodeCollectorConfig.GoMemLimitMib != 0 {
gomemlimitMiB = nodeCollectorConfig.GoMemLimitMib
}

return odigosv1.CollectorsGroupResourcesSettings{
MemoryRequestMiB: 256,
MemoryLimitMiB: 512 + 64,
MemoryLimiterLimitMiB: 512,
MemoryLimiterSpikeLimitMiB: 128, // meaning that collector will start dropping data at 512-128=384MiB
GomemlimitMiB: 512 - 128 - 32, // start aggressive GC 32 MiB before soft limit and dropping data
MemoryRequestMiB: memoryRequestMiB,
MemoryLimitMiB: memoryLimitMiB,
MemoryLimiterLimitMiB: memoryLimiterLimitMiB,
MemoryLimiterSpikeLimitMiB: memoryLimiterSpikeLimitMiB,
GomemlimitMiB: gomemlimitMiB,
}
}

Expand Down
3 changes: 1 addition & 2 deletions tests/e2e/workload-lifecycle/chainsaw-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,7 @@ spec:
try:
- script:
timeout: 60s
content: |
content: |
sleep 20
while true; do
Expand Down

0 comments on commit 0134e62

Please sign in to comment.