Skip to content

Commit

Permalink
Merge branch 'main' into kratos-to-use-category-attributes-v2
Browse files Browse the repository at this point in the history
  • Loading branch information
tamirdavid1 authored Nov 24, 2024
2 parents a5aec42 + 1b69e3d commit 457b189
Show file tree
Hide file tree
Showing 39 changed files with 695 additions and 465 deletions.
52 changes: 52 additions & 0 deletions api/config/crd/bases/odigos.io_collectorsgroups.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,65 @@ spec:
This can be used to resolve conflicting ports when a collector is using the host network.
format: int32
type: integer
memorySettings:
description: |-
Memory settings for the collectors group.
these settings are used to protect the collectors instances from:
- running out of memory and being killed by the k8s OOM killer
- consuming all available memory on the node which can lead to node instability
- pushing back pressure to the instrumented applications
properties:
gomemlimitMiB:
description: |-
the GOMEMLIMIT environment variable value for the collector pod.
this is when go runtime will start garbage collection.
it is recommended to be set to 80% of the hard limit of the memory limiter.
type: integer
memoryLimitMiB:
description: |-
This option sets the limit on the memory usage of the collector.
since the memory limiter mechanism is heuristic, and operates on fixed intervals,
while it cannot fully prevent OOMs, it can help in reducing the chances of OOMs in edge cases.
the settings should prevent the collector from exceeding the memory request,
so one can set this to the same value as the memory request or higher to allow for some buffer for bursts.
type: integer
memoryLimiterLimitMiB:
description: |-
this parameter sets the "limit_mib" parameter in the memory limiter configuration for the collector.
it is the hard limit after which a force garbage collection will be performed.
this value will end up comparing against the go runtime reported heap Alloc value.
According to the memory limiter docs:
> Note that typically the total memory usage of process will be about 50MiB higher than this value
a test from nov 2024 showed that fresh odigos collector with no traffic takes 38MiB,
thus the 50MiB is a good value to start with.
type: integer
memoryLimiterSpikeLimitMiB:
description: |-
this parameter sets the "spike_limit_mib" parameter in the memory limiter configuration for the collector memory limiter.
note that this is not the processor soft limit itself, but the diff in MiB between the hard limit and the soft limit.
according to the memory limiter docs, it is recommended to set this to 20% of the hard limit.
changing this value allows trade-offs between memory usage and resiliency to bursts.
type: integer
memoryRequestMiB:
description: |-
MemoryRequestMiB is the memory resource request to be used on the pod template.
it will be embedded in the pod template as a resource request of the form "memory: <value>Mi"
type: integer
required:
- gomemlimitMiB
- memoryLimitMiB
- memoryLimiterLimitMiB
- memoryLimiterSpikeLimitMiB
- memoryRequestMiB
type: object
role:
enum:
- CLUSTER_GATEWAY
- NODE_COLLECTOR
type: string
required:
- collectorOwnMetricsPort
- memorySettings
- role
type: object
status:
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions api/generated/odigos/applyconfiguration/utils.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 45 additions & 0 deletions api/odigos/v1alpha1/collectorsgroup_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,58 @@ const (
CollectorsGroupRoleNodeCollector CollectorsGroupRole = CollectorsGroupRole(k8sconsts.CollectorsRoleNodeCollector)
)

// CollectorsGroupMemorySettings holds the raw values of the memory settings for the collectors group.
// Any defaulting, validations and calculations should be done in the controllers
// that create this CR.
// Values will be used as is without any further processing.
// All values are expressed in MiB, as the field names indicate.
type CollectorsGroupMemorySettings struct {

// MemoryRequestMiB is the memory resource request to be used on the pod template.
// It will be embedded in the pod template as a resource request of the form "memory: <value>Mi".
MemoryRequestMiB int `json:"memoryRequestMiB"`

// MemoryLimitMiB sets the limit on the memory usage of the collector.
// Since the memory limiter mechanism is heuristic and operates on fixed intervals,
// it cannot fully prevent OOMs, but it can help in reducing the chances of OOMs in edge cases.
// The settings should prevent the collector from exceeding the memory request,
// so one can set this to the same value as the memory request, or higher to allow for some buffer for bursts.
MemoryLimitMiB int `json:"memoryLimitMiB"`

// MemoryLimiterLimitMiB sets the "limit_mib" parameter in the memory limiter configuration for the collector.
// It is the hard limit after which a forced garbage collection will be performed.
// This value will end up being compared against the go runtime reported heap Alloc value.
// According to the memory limiter docs:
// > Note that typically the total memory usage of process will be about 50MiB higher than this value
// A test from nov 2024 showed that a fresh odigos collector with no traffic takes 38MiB,
// thus the 50MiB is a good value to start with.
MemoryLimiterLimitMiB int `json:"memoryLimiterLimitMiB"`

// MemoryLimiterSpikeLimitMiB sets the "spike_limit_mib" parameter in the memory limiter configuration for the collector.
// Note that this is not the processor soft limit itself, but the diff in MiB between the hard limit and the soft limit.
// According to the memory limiter docs, it is recommended to set this to 20% of the hard limit.
// Changing this value allows trade-offs between memory usage and resiliency to bursts.
MemoryLimiterSpikeLimitMiB int `json:"memoryLimiterSpikeLimitMiB"`

// GomemlimitMiB is the GOMEMLIMIT environment variable value for the collector pod.
// This is when the go runtime will start garbage collection.
// It is recommended to be set to 80% of the hard limit of the memory limiter.
GomemlimitMiB int `json:"gomemlimitMiB"`
}

// CollectorsGroupSpec defines the desired state of Collector.
// The CRD schema marks role, collectorOwnMetricsPort and memorySettings as required.
type CollectorsGroupSpec struct {
// Role is the role of this collectors group.
// The CRD schema restricts it to CLUSTER_GATEWAY or NODE_COLLECTOR.
Role CollectorsGroupRole `json:"role"`

// CollectorOwnMetricsPort is the port to use for exposing the collector's own metrics as a prometheus endpoint.
// This can be used to resolve conflicting ports when a collector is using the host network.
CollectorOwnMetricsPort int32 `json:"collectorOwnMetricsPort"`

// MemorySettings are the memory settings for the collectors group.
// These settings are used to protect the collectors instances from:
// - running out of memory and being killed by the k8s OOM killer
// - consuming all available memory on the node which can lead to node instability
// - pushing back pressure to the instrumented applications
MemorySettings CollectorsGroupMemorySettings `json:"memorySettings"`
}

// CollectorsGroupStatus defines the observed state of Collector
Expand Down
16 changes: 16 additions & 0 deletions api/odigos/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions autoscaler/controllers/gateway/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,13 @@ func addSelfTelemetryPipeline(c *config.Config, ownTelemetryPort int32) error {
return nil
}

func syncConfigMap(dests *odigosv1.DestinationList, allProcessors *odigosv1.ProcessorList, gateway *odigosv1.CollectorsGroup, ctx context.Context, c client.Client, scheme *runtime.Scheme, memConfig *memoryConfigurations) (string, []odigoscommon.ObservabilitySignal, error) {
func syncConfigMap(dests *odigosv1.DestinationList, allProcessors *odigosv1.ProcessorList, gateway *odigosv1.CollectorsGroup, ctx context.Context, c client.Client, scheme *runtime.Scheme) (string, []odigoscommon.ObservabilitySignal, error) {
logger := log.FromContext(ctx)

memoryLimiterConfiguration := config.GenericMap{
"check_interval": "1s",
"limit_mib": memConfig.memoryLimiterLimitMiB,
"spike_limit_mib": memConfig.memoryLimiterSpikeLimitMiB,
"limit_mib": gateway.Spec.MemorySettings.MemoryLimiterLimitMiB,
"spike_limit_mib": gateway.Spec.MemorySettings.MemoryLimiterSpikeLimitMiB,
}

processors := common.FilterAndSortProcessorsByOrderHint(allProcessors, odigosv1.CollectorsGroupRoleClusterGateway)
Expand Down
14 changes: 9 additions & 5 deletions autoscaler/controllers/gateway/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ const (
)

func syncDeployment(dests *odigosv1.DestinationList, gateway *odigosv1.CollectorsGroup, configData string,
ctx context.Context, c client.Client, scheme *runtime.Scheme, imagePullSecrets []string, odigosVersion string, memConfig *memoryConfigurations) (*appsv1.Deployment, error) {
ctx context.Context, c client.Client, scheme *runtime.Scheme, imagePullSecrets []string, odigosVersion string) (*appsv1.Deployment, error) {
logger := log.FromContext(ctx)

secretsVersionHash, err := destinationsSecretsVersionsHash(ctx, c, dests)
Expand All @@ -44,7 +44,7 @@ func syncDeployment(dests *odigosv1.DestinationList, gateway *odigosv1.Collector

// Calculate the hash of the config data and the secrets version hash, this is used to make sure the gateway will restart when the config changes
configDataHash := common.Sha256Hash(fmt.Sprintf("%s-%s", configData, secretsVersionHash))
desiredDeployment, err := getDesiredDeployment(dests, configDataHash, gateway, scheme, imagePullSecrets, odigosVersion, memConfig)
desiredDeployment, err := getDesiredDeployment(dests, configDataHash, gateway, scheme, imagePullSecrets, odigosVersion)
if err != nil {
return nil, errors.Join(err, errors.New("failed to get desired deployment"))
}
Expand Down Expand Up @@ -88,9 +88,10 @@ func patchDeployment(existing *appsv1.Deployment, desired *appsv1.Deployment, ct
}

func getDesiredDeployment(dests *odigosv1.DestinationList, configDataHash string,
gateway *odigosv1.CollectorsGroup, scheme *runtime.Scheme, imagePullSecrets []string, odigosVersion string, memConfig *memoryConfigurations) (*appsv1.Deployment, error) {
gateway *odigosv1.CollectorsGroup, scheme *runtime.Scheme, imagePullSecrets []string, odigosVersion string) (*appsv1.Deployment, error) {

requestMemoryQuantity := resource.MustParse(fmt.Sprintf("%dMi", memConfig.memoryRequestMiB))
requestMemoryQuantity := resource.MustParse(fmt.Sprintf("%dMi", gateway.Spec.MemorySettings.MemoryRequestMiB))
limitMemoryQuantity := resource.MustParse(fmt.Sprintf("%dMi", gateway.Spec.MemorySettings.MemoryLimitMiB))

desiredDeployment := &appsv1.Deployment{
ObjectMeta: v1.ObjectMeta{
Expand Down Expand Up @@ -158,7 +159,7 @@ func getDesiredDeployment(dests *odigosv1.DestinationList, configDataHash string
},
{
Name: "GOMEMLIMIT",
Value: fmt.Sprintf("%dMiB", memConfig.gomemlimitMiB),
Value: fmt.Sprintf("%dMiB", gateway.Spec.MemorySettings.GomemlimitMiB),
},
},
SecurityContext: &corev1.SecurityContext{
Expand Down Expand Up @@ -190,6 +191,9 @@ func getDesiredDeployment(dests *odigosv1.DestinationList, configDataHash string
Requests: corev1.ResourceList{
corev1.ResourceMemory: requestMemoryQuantity,
},
Limits: corev1.ResourceList{
corev1.ResourceMemory: limitMemoryQuantity,
},
},
},
},
Expand Down
4 changes: 2 additions & 2 deletions autoscaler/controllers/gateway/hpa.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ var (
stabilizationWindowSeconds = intPtr(300) // cooldown period for scaling down
)

func syncHPA(gateway *odigosv1.CollectorsGroup, ctx context.Context, c client.Client, scheme *runtime.Scheme, memConfig *memoryConfigurations, kubeVersion *version.Version) error {
func syncHPA(gateway *odigosv1.CollectorsGroup, ctx context.Context, c client.Client, scheme *runtime.Scheme, kubeVersion *version.Version) error {
logger := log.FromContext(ctx)

var hpa client.Object

memLimit := memConfig.gomemlimitMiB * memoryLimitPercentageForHPA / 100.0
memLimit := gateway.Spec.MemorySettings.GomemlimitMiB * memoryLimitPercentageForHPA / 100.0
metricQuantity := resource.MustParse(fmt.Sprintf("%dMi", memLimit))

switch {
Expand Down
Loading

0 comments on commit 457b189

Please sign in to comment.