From 8a082857287364c37348622a68ebdda701ca5928 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 12 Feb 2026 14:38:14 +0100 Subject: [PATCH 01/17] metadata node fetchers --- internal/test/integration/aws_test.go | 104 ++++++++++ .../configs/aws-metadata-mock.json | 188 ++++++++++++++++++ pkg/appolly/meta/meta_node.go | 111 +++++++++++ pkg/appolly/meta/meta_node_test.go | 58 ++++++ pkg/internal/helpers/iters/iters.go | 42 ++++ pkg/internal/helpers/iters/iters_test.go | 44 ++++ 6 files changed, 547 insertions(+) create mode 100644 internal/test/integration/aws_test.go create mode 100644 internal/test/integration/configs/aws-metadata-mock.json create mode 100644 pkg/appolly/meta/meta_node.go create mode 100644 pkg/appolly/meta/meta_node_test.go create mode 100644 pkg/internal/helpers/iters/iters.go create mode 100644 pkg/internal/helpers/iters/iters_test.go diff --git a/internal/test/integration/aws_test.go b/internal/test/integration/aws_test.go new file mode 100644 index 0000000000..807e617617 --- /dev/null +++ b/internal/test/integration/aws_test.go @@ -0,0 +1,104 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package integration + +import ( + "fmt" + "testing" + "time" + + "github.com/ory/dockertest/v3" + "github.com/ory/dockertest/v3/docker" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.opentelemetry.io/obi/internal/test/integration/components/promtest" + ti "go.opentelemetry.io/obi/pkg/test/integration" +) + +func setupMockIMDS(t *testing.T, network *dockertest.Network) { + t.Helper() + + t.Log("Starting AWS EC2 Metadata Mock container...") + mockIMDS, err := dockerPool.RunWithOptions(&dockertest.RunOptions{ + Repository: "amazon/amazon-ec2-metadata-mock", + Tag: "v1.9.2", + Name: fmt.Sprintf("mock-imds-test-%d", time.Now().UnixNano()), + Mounts: []string{ + pathRoot + "/internal/test/integration/configs/aws-metadata-mock.json:/config/aws-metadata-mock.json", + }, + Cmd: 
[]string{ + "--config-file", "/config/aws-metadata-mock.json", + "--port", "1338", + }, + ExposedPorts: []string{"1338/tcp"}, + }) + require.NoError(t, err, "could not start AWS EC2 Metadata Mock container") + t.Cleanup(func() { + require.NoError(t, dockerPool.Purge(mockIMDS), "could not remove AWS EC2 Metadata Mock container") + }) + + // Connect to network with alias for metadata service + err = dockerPool.Client.ConnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{ + Container: mockIMDS.Container.ID, + EndpointConfig: &docker.EndpointConfig{ + Aliases: []string{"mock-imds"}, + }, + }) + require.NoError(t, err, "could not connect AWS EC2 Metadata Mock container to network") + t.Log("AWS EC2 Metadata Mock container started") +} + +// This file contains tests related with the integration with Amazon Web Services +func TestClusterName(t *testing.T) { + clusterName := "test-eks-cluster" + + network := setupDockerNetwork(t) + setupContainerPrometheus(t, network, "prometheus-config.yml") + setupContainerJaeger(t, network) + setupContainerCollector(t, network, "otelcol-config.yml") + setupMockIMDS(t, network) + defer network.Close() + testserver := setupGoOTelTestServer(t, network, nil) + + if t.Failed() { + return + } + + // Start OBI to instrument the test server + // Configure OBI to use the mock IMDS by setting the EC2 metadata endpoint + o := obi{ + Env: []string{ + "OTEL_EBPF_OPEN_PORT=8080", + // Configure AWS SDK to use custom endpoint for EC2 metadata + // The official amazon-ec2-metadata-mock runs on port 1338 + "AWS_EC2_METADATA_SERVICE_ENDPOINT=http://mock-imds:1338", + }, + } + if !KernelLockdownMode() { + o.SecurityConfigSuffix = "_none" + } + o.instrument(t, network, testserver, "obi-config-aws.yml") + + t.Run("Cluster name from EC2 metadata", func(t *testing.T) { + // Wait for test components to be ready + waitForTestComponents(t, "http://localhost:8080") + + // Make some requests to generate metrics + for range 4 { + ti.DoHTTPGet(t, 
"http://localhost:8080/rolldice", 200) + } + + // Query Prometheus for target_info with cluster_name attribute + pq := promtest.Client{HostPort: prometheusHostPort} + + // Check that the cluster_name appears in the target_info metric + require.EventuallyWithT(t, func(ct *assert.CollectT) { + query := fmt.Sprintf(`target_info{k8s_cluster_name="%s"}`, clusterName) + results, err := pq.Query(query) + require.NoError(ct, err, "failed to query Prometheus") + assert.NotEmpty(ct, results, "target_info with k8s_cluster_name should exist") + }, testTimeout, 500*time.Millisecond) + }) +} diff --git a/internal/test/integration/configs/aws-metadata-mock.json b/internal/test/integration/configs/aws-metadata-mock.json new file mode 100644 index 0000000000..156c2d085b --- /dev/null +++ b/internal/test/integration/configs/aws-metadata-mock.json @@ -0,0 +1,188 @@ +{ + "metadata": { + "paths": { + "ami-id": "/latest/meta-data/ami-id", + "ami-launch-index": "/latest/meta-data/ami-launch-index", + "ami-manifest-path": "/latest/meta-data/ami-manifest-path", + "block-device-mapping-ami": "/latest/meta-data/block-device-mapping/ami", + "block-device-mapping-ebs": "/latest/meta-data/block-device-mapping/ebs0", + "block-device-mapping-ephemeral": "/latest/meta-data/block-device-mapping/ephemeral0", + "block-device-mapping-root": "/latest/meta-data/block-device-mapping/root", + "block-device-mapping-swap": "/latest/meta-data/block-device-mapping/swap", + "elastic-inference-associations": "/latest/meta-data/elastic-inference/associations", + "elastic-inference-accelerator": "/latest/meta-data/elastic-inference/associations/eia-bfa21c7904f64a82a21b9f4540169ce1", + "events": "/latest/meta-data/events/maintenance/scheduled", + "hostname": "/latest/meta-data/hostname", + "iam-info": "/latest/meta-data/iam/info", + "iam-security-credentials-role": "/latest/meta-data/iam/security-credentials", + "iam-security-credentials": "/latest/meta-data/iam/security-credentials/baskinc-role", + 
"instance-action": "/latest/meta-data/instance-action", + "instance-id": "/latest/meta-data/instance-id", + "instance-life-cycle": "/latest/meta-data/instance-life-cycle", + "instance-type": "/latest/meta-data/instance-type", + "kernel-id": "/latest/meta-data/kernel-id", + "local-hostname": "/latest/meta-data/local-hostname", + "local-ipv4": "/latest/meta-data/local-ipv4", + "mac": "/latest/meta-data/mac", + "mac-device-number": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/device-number", + "mac-ipv4-associations": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/ipv4-associations/192.0.2.54", + "mac-ipv6-associations": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/ipv6s", + "mac-local-hostname": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/local-hostname", + "mac-local-ipv4s": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/local-ipv4s", + "mac-mac": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/mac", + "mac-network-interface-id": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/interface-id", + "mac-network-interface-card-index": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/network-card-index", + "mac-owner-id": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/owner-id", + "mac-public-hostname": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/public-hostname", + "mac-public-ipv4s": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/public-ipv4s", + "mac-security-group-ids": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/security-group-ids", + "mac-security-groups": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/security-groups", + "mac-subnet-id": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/subnet-id", + "mac-subnet-ipv4-cidr-block": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/subnet-ipv4-cidr-block", + "mac-subnet-ipv6-cidr-blocks": 
"/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/subnet-ipv6-cidr-blocks", + "mac-vpc-id": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/vpc-id", + "mac-vpc-ipv4-cidr-block": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/vpc-ipv4-cidr-block", + "mac-vpc-ipv4-cidr-blocks": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/vpc-ipv4-cidr-blocks", + "mac-vpc-ipv6-cidr-blocks": "/latest/meta-data/network/interfaces/macs/0e:49:61:0f:c3:11/vpc-ipv6-cidr-blocks", + "placement-availability-zone": "/latest/meta-data/placement/availability-zone", + "placement-availability-zone-id": "/latest/meta-data/placement/availability-zone-id", + "placement-group-name": "/latest/meta-data/placement/group-name", + "placement-host-id": "/latest/meta-data/placement/host-id", + "placement-partition-number": "/latest/meta-data/placement/partition-number", + "placement-region": "/latest/meta-data/placement/region", + "product-codes": "/latest/meta-data/product-codes", + "public-hostname": "/latest/meta-data/public-hostname", + "public-ipv4": "/latest/meta-data/public-ipv4", + "public-key": "/latest/meta-data/public-keys/0/openssh-key", + "ramdisk-id": "/latest/meta-data/ramdisk-id", + "reservation-id": "/latest/meta-data/reservation-id", + "security-groups": "/latest/meta-data/security-groups", + "services-domain": "/latest/meta-data/services/domain", + "services-partition": "/latest/meta-data/services/partition", + "spot": "/latest/meta-data/spot/instance-action", + "spot-termination-time": "/latest/meta-data/spot/termination-time", + "rebalance-rec-time": "/latest/meta-data/events/recommendations/rebalance", + "tags-instance-name": "/latest/meta-data/tags/instance/Name", + "tags-instance-test": "/latest/meta-data/tags/instance/Test", + "target-lifecycle-state": "/latest/meta-data/autoscaling/target-lifecycle-state" + }, + "values": { + "ami-id": "ami-0a887e401f7654935", + "ami-launch-index": "0", + "ami-manifest-path": "(unknown)", + 
"block-device-mapping-ami": "/dev/xvda", + "block-device-mapping-ebs": "sdb", + "block-device-mapping-ephemeral": "sdb", + "block-device-mapping-root": "/dev/xvda", + "block-device-mapping-swap": "sdcs", + "event-id": "instance-event-1234567890abcdef0", + "hostname": "ip-172-16-34-43.ec2.internal", + "instance-action": "none", + "instance-id": "i-1234567890abcdef0", + "instance-life-cycle": "on-demand", + "instance-type": "m4.xlarge", + "kernel-id": "aki-5c21674b", + "local-hostname": "ip-172-16-34-43.ec2.internal", + "local-ipv4": "172.16.34.43", + "mac": "0e:49:61:0f:c3:11", + "mac-device-number": "0", + "mac-ipv4-associations": "192.0.2.54", + "mac-ipv6-associations": "2001:db8:8:4::2", + "mac-local-hostname": "ip-172-16-34-43.ec2.internal", + "mac-local-ipv4s": "172.16.34.43", + "mac-mac": "0e:49:61:0f:c3:11", + "mac-network-interface-id": "eni-0f95d3625f5c521cc", + "mac-network-interface-card-index": "0", + "mac-owner-id": "515336597381", + "mac-public-hostname": "ec2-192-0-2-54.compute-1.amazonaws.com", + "mac-public-ipv4s": "192.0.2.54", + "mac-security-group-ids": "sg-0b07f8f6cb485d4df", + "mac-security-groups": "ura-launch-wizard-harry-1", + "mac-subnet-id": "subnet-0ac62554", + "mac-subnet-ipv4-cidr-block": "192.0.2.0/24", + "mac-subnet-ipv6-cidr-blocks": "2001:db8::/32", + "mac-vpc-id": "vpc-d295a6a7", + "mac-vpc-ipv4-cidr-block": "192.0.2.0/24", + "mac-vpc-ipv4-cidr-blocks": "192.0.2.0/24", + "mac-vpc-ipv6-cidr-blocks": "2001:db8::/32", + "placement-availability-zone": "us-east-1a", + "placement-availability-zone-id": "use1-az4", + "placement-group-name": "a-placement-group", + "placement-host-id": "h-0da999999f9999fb9", + "placement-partition-number": "1", + "placement-region": "us-east-1", + "product-codes": "3iplms73etrdhxdepv72l6ywj", + "public-hostname": "ec2-192-0-2-54.compute-1.amazonaws.com", + "public-ipv4": "192.0.2.54", + "public-key": "ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABAQC/JxGByvHDHgQAU+0nRFWdvMPi22OgNUn9ansrI8QN1ZJGxD1ML8DRnJ3Q3zFKqqjGucfNWW0xpVib+ttkIBp8G9P/EOcX9C3FF63O3SnnIUHJsp5faRAZsTJPx0G5HUbvhBvnAcCtSqQgmr02c1l582vAWx48pOmeXXMkl9qe9V/s7K3utmeZkRLo9DqnbsDlg5GWxLC/rWKYaZR66CnMEyZ7yBy3v3abKaGGRovLkHNAgWjSSgmUTI1nT5/S2OLxxuDnsC7+BiABLPaqlIE70SzcWZ0swx68Bo2AY9T9ymGqeAM/1T4yRtg0sPB98TpT7WrY5A3iia2UVtLO/xcTt test", + "ramdisk-id": "ari-01bb5768", + "reservation-id": "r-046cb3eca3e201d2f", + "security-groups": "ura-launch-wizard-harry-1", + "services-domain": "amazonaws.com", + "services-partition": "aws", + "tags-instance-name": "test-instance", + "tags-instance-test": "test-tag", + "iam-info": { + "Code": "Success", + "LastUpdated": "2020-04-02T18:50:40Z", + "InstanceProfileArn": "arn:aws:iam::896453262835:instance-profile/baskinc-role", + "InstanceProfileId": "AIPA5BOGHHXZELSK34VU4" + }, + "iam-security-credentials-role": "baskinc-role", + "iam-security-credentials": { + "Code": "Success", + "LastUpdated": "2020-04-02T18:50:40Z", + "Type": "AWS-HMAC", + "AccessKeyId": "12345678901", + "SecretAccessKey": "v/12345678901", + "Token": "TEST92test48TEST+y6RpoTEST92test48TEST/8oWVAiBqTEsT5Ky7ty2tEStxC1T==", + "Expiration": "2020-04-02T00:49:51Z" + }, + "elastic-inference-associations": "eia-bfa21c7904f64a82a21b9f4540169ce1", + "elastic-inference-accelerator": { + "version_2018_04_12": { + "elastic-inference-accelerator-id": "eia-bfa21c7904f64a82a21b9f4540169ce1", + "elastic-inference-accelerator-type": "eia1.medium" + } + } + } + }, + "dynamic": { + "paths": { + "instance-identity-document": "/latest/dynamic/instance-identity/document", + "instance-identity-pkcs": "/latest/dynamic/instance-identity/pkcs7", + "instance-identity-signature": "/latest/dynamic/instance-identity/signature", + "fws-instance-monitoring": "/latest/dynamic/fws/instance-monitoring" + }, + "values": { + "instance-identity-document": { + "accountId": "0123456789", + "imageId": "ami-0b69ea66ff7391e80", + "availabilityZone": 
"us-east-1f", + "ramdiskId": null, + "kernelId": null, + "devpayProductCodes": null, + "marketplaceProductCodes": null, + "version": "2017-09-30", + "privateIp": "10.0.7.10", + "billingProducts": null, + "instanceId": "i-1234567890abcdef0", + "pendingTime": "2019-10-31T07:02:24Z", + "architecture": "x86_64", + "instanceType": "m4.xlarge", + "region": "us-east-1" + }, + "instance-identity-pkcs": "TESTCSqGSIb3DQEHZqCZPIZCZQExCzZJGgUrDgPCGgUZPIZGCSqGSIb3DQEHZaCZJIZEggHderog\nICJhY2NvdW50SWQiIDogIjUxNTPzNjU5NzP4PCIsCiZgImFyY2hpdGVjdHVyZSIgOiZieDg2XzY0\nIirKICZiYXZhaWxhYmlsaXR5Wm9uZSIgOiZidXPtZWFzdC0xYSIsCiZgImJpbGxpbmdQcm9kdWN0\ncyIgOiGudWxsLZogICJkZXZrYXlQcm9kdWN0Q29kZXPiIDogbnVsbCrKICZibWFya2V0cGxhY2VQ\ncm9kdWN0Q29kZXPiIDogbnVsbCrKICZiaW1hZ2VJZCIgOiZiYW1pLTGhODg3ZTQrPWY3NjU0OTP1\nIirKICZiaW5zdGFuY2VJZCIgOiZiaS0rYjU5YTdiN2NlN2UzYmIrYSIsCiZgImluc3RhbmNlVHlr\nZSIgOiZibTQueGxhcmdlIirKICZia2VybmVsSWQiIDogbnVsbCrKICZicGVuZGluZ1RpbWUiIDog\nIjIrPjZtPDPtPDJUPjZ6PzY6NThaIirKICZicHJpdmF0ZUlrIiZ6ICIxNzIuPzEuPzQuNDPiLZog\nICJyYW1kaXNrSWQiIDogbnVsbCrKICZicmVnaW9uIiZ6ICJ1cy1lYXN0LTEiLZogICJ2ZXJzaW9u\nIiZ6ICIyPDE3LTZ5LTPrIgp9ZZZZZZZZPYIGGTCCZRUCZQEraTGcPQsrCQYDVQQGErJVUzEZPGcG\nZ1UECGPQV2FzaGluZ3RvbiGTdGF0ZTEQPZ4GZ1UEGxPHU2VhdHRsZTEgPG4GZ1UEChPXQW1hem9u\nIFdlYiGTZXJ2aWNlcyGPTEPCCQCWukjZ5V4aZzZJGgUrDgPCGgUZoF0rGZYJKoZIhvcNZQkDPQsG\nCSqGSIb3DQEHZTZcGgkqhkiG9r0GCQUxDxcNPjZrPzZyPjZzNzZ1WjZjGgkqhkiG9r0GCQQxFgQU\nN1xlDhvo6cYuGjXZ+mlTW66Ff8rrCQYHKoZIzjgEZrQrPC4CFQCjzjYV1zGUZUxTf6rGO0en/PxR\n3ZIVZK589qSkEaslLdCzeX2GnQ6dz9UeZZZZZZZZ", + "instance-identity-signature": "TesTTKmBbj+DUw6ut6BOr4mFGpax/k6BhIbsotUHvSIhqv7oKqwB4HZhgGP2Gvcxtz5m3QGUbnwI\nhy33GWxjn7+qfZ/GUeZB1Ilc+3rW/P9G/tGxIB3HtqB6q2J6B4DOh6CJiH+BnrHazGW+bJD406Nz\neP9n/rGEGGm0cGEbbeB=", + "fws-instance-monitoring": "disabled" + } + }, + "userdata": { + "paths": { + "userdata": "/latest/user-data" + }, + "values": { + "userdata": "MTIzNCxqb2huLHJlYm9vdCx0cnVlCg==" + } + } +} \ No newline at end of file diff --git 
a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go new file mode 100644 index 0000000000..1451d27731 --- /dev/null +++ b/pkg/appolly/meta/meta_node.go @@ -0,0 +1,111 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package meta + +import ( + "context" + "iter" + "log/slog" + "slices" + "sync" + "time" + + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" + "go.opentelemetry.io/obi/pkg/internal/helpers/iters" +) + +func nslog() *slog.Logger { + return slog.With("component", "meta.NodeStore") +} + +// TODO: make configurable +const ( + retryTimeout = 30 * time.Second + retryStartInterval = 500 * time.Millisecond + retryMaxInterval = 5 * time.Second +) + +// host metadata is common to all the instrumented applications within a single +// physical node, cloud instance or local virtual machine. +// This information only needs to be retrieved once at startup, and will be +// directly added in the metrics and traces export, since it has no sense +// configuring an OBI instance to filter by attributes that are static for it. + +// each fetcher implementation will return error only when retrying has sense. +// For example, we must not retry if a cloud API endpoint does not exist or it returns 4xx errors, +// because this would mean that OBI is not being executed in that cloud provider. +// But we can retry if the cloud API endpoint returns 5xx errors, as this would indicate +// a temporary unavailability in the Cloud Metadata sevice. 
+type fetcher func(ctx context.Context) (iter.Seq2[attr.Name, string], error) + +type NodeStore struct { + entries []Entry +} + +type Entry struct { + Key attr.Name + Value string +} + +func NewNodeStore( + ctx context.Context, + fetchers ...fetcher, +) *NodeStore { + return &NodeStore{ + entries: fetchEntries(ctx, fetchers...), + } +} + +func fetchEntries( + ctx context.Context, + fetchers ...fetcher, +) []Entry { + log := nslog() + wg := sync.WaitGroup{} + // we run in parallel to avoid that timeouts/retries delay the startup too much + // but we want to keep the priority of the fetchers, so later fetchers can override + // some data from previous fetchers + results := make([]iter.Seq2[attr.Name, string], len(fetchers)) + for i, fetch := range fetchers { + wg.Go(func() { + results[i] = backoffFetch(ctx, fetch, log.With("fetcher", i)) + }) + } + wg.Wait() + + jointResults := iters.Concat2(results...) + resultsAsEntry := iters.Map2Seq(jointResults, + func(k attr.Name, v string) Entry { return Entry{Key: k, Value: v} }) + return slices.Collect(resultsAsEntry) +} + +func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) iter.Seq2[attr.Name, string] { + backoff := retryStartInterval + start := time.Now() + for { + seq, err := fetch(ctx) + if err == nil { + return seq + } + // exponential backoff retry strategy + if time.Since(start) > retryTimeout { + log.Warn("timeout reached while looking for metadata. Giving up", "error", err) + return iters.Empty2[attr.Name, string]() + } + log.Debug("can't fetch metadata. Will retry", + "retryAfter", backoff, "error", err) + select { + case <-time.After(backoff): + // continue loop! + case <-ctx.Done(): + log.Debug("context canceled. 
Exiting") + return iters.Empty2[attr.Name, string]() + } + backoff = min(backoff*2, retryMaxInterval) + } +} + +func (sg *NodeStore) Get() iter.Seq[Entry] { + return slices.Values(sg.entries) +} diff --git a/pkg/appolly/meta/meta_node_test.go b/pkg/appolly/meta/meta_node_test.go new file mode 100644 index 0000000000..c1ee43c1da --- /dev/null +++ b/pkg/appolly/meta/meta_node_test.go @@ -0,0 +1,58 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package meta + +import ( + "context" + "errors" + "iter" + "sync/atomic" + "testing" + "testing/synctest" + + "github.com/stretchr/testify/require" + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" +) + +func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + ctx := context.Background() + + // Create fetchers that fail different numbers of times before succeeding + failOnce := makeFetcherThatFailsNTimes(1, "fetcher1", "value1") + alwaysFails := func(ctx context.Context) (iter.Seq2[attr.Name, string], error) { + return nil, errors.New("permanent failure") + } + failTwice := makeFetcherThatFailsNTimes(2, "fetcher2", "value2") + succeedImmediately := makeFetcherThatFailsNTimes(0, "fetcher3", "value3") + + entries := fetchEntries(ctx, failOnce, alwaysFails, failTwice, succeedImmediately) + + // All fetchers should eventually succeed and return their data + require.Equal(t, []Entry{ + {Key: "fetcher1_1", Value: "value1_1"}, {Key: "fetcher1_2", Value: "value1_2"}, + {Key: "fetcher2_1", Value: "value2_1"}, {Key: "fetcher2_2", Value: "value2_2"}, + {Key: "fetcher3_1", Value: "value3_1"}, {Key: "fetcher3_2", Value: "value3_2"}, + }, entries) + synctest.Wait() + }) +} + +func makeFetcherThatFailsNTimes(failCount int, key, value string) fetcher { + attempts := atomic.Int32{} + return func(ctx context.Context) (iter.Seq2[attr.Name, string], error) { + attempt := attempts.Add(1) + if attempt <= int32(failCount) { + return nil, errors.New("simulated 
failure") + } + return seq(key, value), nil + } +} + +func seq(key, value string) iter.Seq2[attr.Name, string] { + return func(yield func(attr.Name, string) bool) { + yield(attr.Name(key+"_1"), value+"_1") + yield(attr.Name(key+"_2"), value+"_2") + } +} diff --git a/pkg/internal/helpers/iters/iters.go b/pkg/internal/helpers/iters/iters.go new file mode 100644 index 0000000000..be38ac311c --- /dev/null +++ b/pkg/internal/helpers/iters/iters.go @@ -0,0 +1,42 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +// Package iters provides some helper functions for confortably working with iter.Seq +// and iter.Seq2. +// The code is copied from https://github.com/mariomac/iters, but we only keep there +// the functions we need to minimize external dependencies, in number and surface. + +package iters // import "go.opentelemetry.io/obi/pkg/internal/helpers/iters" + +import "iter" + +// Empty2 returns an empty iter.Seq2 +func Empty2[T1, T2 any]() iter.Seq2[T1, T2] { + return func(_ func(T1, T2) bool) {} +} + +// Concat2 creates a lazily concatenated iter.Seq2 whose elements are all the elements of the first +// provided iter.Seq2 followed by all the elements of the second provided iter.Seq2, followed by the +// elements of the third iter.Seq2 (if any), and so on. 
+func Concat2[K, V any](seqs ...iter.Seq2[K, V]) iter.Seq2[K, V] { + return func(yield func(K, V) bool) { + for _, seq := range seqs { + for k, v := range seq { + if !yield(k, v) { + return + } + } + } + } +} + +// Map2Seq transforms an input iter.Seq2 into an iter.Seq by applying a mapper function to each element +func Map2Seq[K, V, O any](input iter.Seq2[K, V], mapper func(K, V) O) iter.Seq[O] { + return func(yield func(O) bool) { + for k, v := range input { + if !yield(mapper(k, v)) { + return + } + } + } +} diff --git a/pkg/internal/helpers/iters/iters_test.go b/pkg/internal/helpers/iters/iters_test.go new file mode 100644 index 0000000000..c08ccb1099 --- /dev/null +++ b/pkg/internal/helpers/iters/iters_test.go @@ -0,0 +1,44 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package iters + +import ( + "iter" + "maps" + "slices" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestEmpty2(t *testing.T) { + assert.Empty(t, maps.Collect(Empty2[int, int]())) +} + +func TestConcat2_Map2Seq(t *testing.T) { + var new = func(k []int, v []int) iter.Seq2[int, int] { + return func(yield func(int, int) bool) { + for i := 0; i < len(k) && i < len(v); i++ { + if !yield(k[i], v[i]) { + return + } + } + } + } + var kvTuple = func(k, v int) [2]int { return [2]int{k, v} } + + concat := Concat2[int, int]( + new([]int{1, 2, 3}, []int{4, 5, 6}), + new([]int{7, 8, 9}, []int{10, 11, 12}), + new([]int{13, 14, 15}, []int{16, 17, 18}), + ) + + assert.Equal(t, + [][2]int{{1, 4}, {2, 5}, {3, 6}, {7, 10}, {8, 11}, {9, 12}, {13, 16}, {14, 17}, {15, 18}}, + slices.Collect(Map2Seq(concat, kvTuple))) + // checking that sequences can be iterated multiple times + assert.Equal(t, + [][2]int{{1, 4}, {2, 5}, {3, 6}, {7, 10}, {8, 11}, {9, 12}, {13, 16}, {14, 17}, {15, 18}}, + slices.Collect(Map2Seq(concat, kvTuple))) +} From fb11064ad7851d4be495aab8fa1c54ae9927a9bd Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 12 Feb 2026 16:57:13 +0100 
Subject: [PATCH 02/17] basic node aws retrieval --- go.mod | 2 +- internal/test/integration/aws_test.go | 3 +- internal/test/integration/dockerutil_test.go | 13 +++- pkg/appolly/meta/meta_node.go | 51 +++++++------ pkg/appolly/meta/meta_node_aws.go | 75 ++++++++++++++++++++ pkg/appolly/meta/meta_node_test.go | 49 +++++++++---- pkg/export/attributes/names/attrs.go | 8 ++- pkg/internal/helpers/iters/iters.go | 42 ----------- pkg/internal/helpers/iters/iters_test.go | 44 ------------ 9 files changed, 159 insertions(+), 128 deletions(-) create mode 100644 pkg/appolly/meta/meta_node_aws.go delete mode 100644 pkg/internal/helpers/iters/iters.go delete mode 100644 pkg/internal/helpers/iters/iters_test.go diff --git a/go.mod b/go.mod index 91af101767..56d56e4e4e 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.25.6 require ( github.com/AlessandroPomponio/go-gibberish v0.0.0-20191004143433-a2d4156f0396 + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.15 github.com/caarlos0/env/v9 v9.0.0 github.com/cilium/ebpf v0.20.0 github.com/containers/common v0.64.2 @@ -98,7 +99,6 @@ require ( github.com/aws/aws-sdk-go-v2 v1.40.1 // indirect github.com/aws/aws-sdk-go-v2/config v1.32.3 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.19.3 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.15 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.15 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.15 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect diff --git a/internal/test/integration/aws_test.go b/internal/test/integration/aws_test.go index 807e617617..21362d9bd2 100644 --- a/internal/test/integration/aws_test.go +++ b/internal/test/integration/aws_test.go @@ -12,7 +12,6 @@ import ( "github.com/ory/dockertest/v3/docker" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "go.opentelemetry.io/obi/internal/test/integration/components/promtest" ti 
"go.opentelemetry.io/obi/pkg/test/integration" ) @@ -23,7 +22,7 @@ func setupMockIMDS(t *testing.T, network *dockertest.Network) { t.Log("Starting AWS EC2 Metadata Mock container...") mockIMDS, err := dockerPool.RunWithOptions(&dockertest.RunOptions{ Repository: "amazon/amazon-ec2-metadata-mock", - Tag: "v1.9.2", + Tag: versionAWSMetaMock, Name: fmt.Sprintf("mock-imds-test-%d", time.Now().UnixNano()), Mounts: []string{ pathRoot + "/internal/test/integration/configs/aws-metadata-mock.json:/config/aws-metadata-mock.json", diff --git a/internal/test/integration/dockerutil_test.go b/internal/test/integration/dockerutil_test.go index a6337a27ce..b45a8a2067 100644 --- a/internal/test/integration/dockerutil_test.go +++ b/internal/test/integration/dockerutil_test.go @@ -15,6 +15,13 @@ import ( "github.com/stretchr/testify/require" ) +const ( + versionPrometheus = "v2.55.1" + versionJaeger = "1.60" + versionCollector = "0.144.0" + versionAWSMetaMock = "v1.9.2" +) + // setupDockerNetwork initializes a custom network for the test. 
func setupDockerNetwork(t *testing.T) *dockertest.Network { t.Helper() @@ -36,7 +43,7 @@ func setupContainerPrometheus(t *testing.T, network *dockertest.Network, configF t.Log("Starting Prometheus container...") prometheus, err := dockerPool.RunWithOptions(&dockertest.RunOptions{ Repository: "quay.io/prometheus/prometheus", - Tag: "v2.55.1", + Tag: versionPrometheus, Name: fmt.Sprintf("prometheus-otel-test-%d", time.Now().UnixNano()), Networks: []*dockertest.Network{network}, Mounts: []string{ @@ -66,7 +73,7 @@ func setupContainerJaeger(t *testing.T, network *dockertest.Network) { t.Log("Starting Jaeger container...") jaeger, err := dockerPool.RunWithOptions(&dockertest.RunOptions{ Repository: "jaegertracing/all-in-one", - Tag: "1.60", + Tag: versionJaeger, Name: fmt.Sprintf("jaeger-otel-test-%d", time.Now().UnixNano()), Env: []string{ "COLLECTOR_OTLP_ENABLED=true", @@ -100,7 +107,7 @@ func setupContainerCollector(t *testing.T, network *dockertest.Network, configFi t.Log("Starting OpenTelemetry Collector container...") otelcol, err := dockerPool.RunWithOptions(&dockertest.RunOptions{ Repository: "otel/opentelemetry-collector-contrib", - Tag: "0.144.0", + Tag: versionCollector, Name: fmt.Sprintf("otelcol-otel-test-%d", time.Now().UnixNano()), Cmd: []string{"--config=/etc/otelcol-config/" + configFile}, Mounts: []string{ diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index 1451d27731..0e424e9577 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -5,14 +5,11 @@ package meta import ( "context" - "iter" "log/slog" - "slices" "sync" "time" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" - "go.opentelemetry.io/obi/pkg/internal/helpers/iters" ) func nslog() *slog.Logger { @@ -37,10 +34,10 @@ const ( // because this would mean that OBI is not being executed in that cloud provider. 
// But we can retry if the cloud API endpoint returns 5xx errors, as this would indicate // a temporary unavailability in the Cloud Metadata sevice. -type fetcher func(ctx context.Context) (iter.Seq2[attr.Name, string], error) +type fetcher func(ctx context.Context) ([]Entry, error) type NodeStore struct { - entries []Entry + Metadata []Entry } type Entry struct { @@ -50,10 +47,11 @@ type Entry struct { func NewNodeStore( ctx context.Context, - fetchers ...fetcher, ) *NodeStore { return &NodeStore{ - entries: fetchEntries(ctx, fetchers...), + Metadata: fetchEntries(ctx, + awsNodeFetcher, + ), } } @@ -66,7 +64,7 @@ func fetchEntries( // we run in parallel to avoid that timeouts/retries delay the startup too much // but we want to keep the priority of the fetchers, so later fetchers can override // some data from previous fetchers - results := make([]iter.Seq2[attr.Name, string], len(fetchers)) + results := make([][]Entry, len(fetchers)) for i, fetch := range fetchers { wg.Go(func() { results[i] = backoffFetch(ctx, fetch, log.With("fetcher", i)) @@ -74,38 +72,49 @@ func fetchEntries( } wg.Wait() - jointResults := iters.Concat2(results...) - resultsAsEntry := iters.Map2Seq(jointResults, - func(k attr.Name, v string) Entry { return Entry{Key: k, Value: v} }) - return slices.Collect(resultsAsEntry) + // Concatenate all results maintaining order + var allEntries []Entry + for _, entries := range results { + allEntries = append(allEntries, entries...) + } + return dedupeKeys(allEntries) } -func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) iter.Seq2[attr.Name, string] { +func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) []Entry { backoff := retryStartInterval start := time.Now() for { - seq, err := fetch(ctx) + entries, err := fetch(ctx) if err == nil { - return seq + return entries } // exponential backoff retry strategy if time.Since(start) > retryTimeout { log.Warn("timeout reached while looking for metadata. 
Giving up", "error", err) - return iters.Empty2[attr.Name, string]() + return nil } - log.Debug("can't fetch metadata. Will retry", - "retryAfter", backoff, "error", err) + log.Debug("can't fetch metadata. Will retry", "retryAfter", backoff, "error", err) select { case <-time.After(backoff): // continue loop! case <-ctx.Done(): log.Debug("context canceled. Exiting") - return iters.Empty2[attr.Name, string]() + return nil } backoff = min(backoff*2, retryMaxInterval) } } -func (sg *NodeStore) Get() iter.Seq[Entry] { - return slices.Values(sg.entries) +func dedupeKeys(entries []Entry) []Entry { + keyPos := map[attr.Name]int{} + out := make([]Entry, 0, len(entries)) + for _, entry := range entries { + if pos, ok := keyPos[entry.Key]; ok { + out[pos] = entry + } else { + out = append(out, entry) + keyPos[entry.Key] = len(out) - 1 + } + } + return out } diff --git a/pkg/appolly/meta/meta_node_aws.go b/pkg/appolly/meta/meta_node_aws.go new file mode 100644 index 0000000000..308d448229 --- /dev/null +++ b/pkg/appolly/meta/meta_node_aws.go @@ -0,0 +1,75 @@ +package meta + +import ( + "context" + "io" + "log/slog" + + "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" +) + +func awsNodeFetcher(ctx context.Context) ([]Entry, error) { + log := slog.With("component", "meta.NodeStore.awsNodeFetcher") + + // Create IMDS client with default options + // The client will use IMDSv2 by default with a 5-second timeout + client := imds.New(imds.Options{}) + + // Helper function to get metadata from a path + getMetadata := func(path string) (string, error) { + output, err := client.GetMetadata(ctx, &imds.GetMetadataInput{ + Path: path, + }) + if err != nil { + return "", err + } + defer output.Content.Close() + + data, err := io.ReadAll(output.Content) + if err != nil { + return "", err + } + return string(data), nil + } + + // Try to get instance ID first to check if we're on AWS EC2 + // If this fails, we're likely not on AWS, 
so return empty without error + // (no point in retrying if IMDS is not available) + instanceID, err := getMetadata("instance-id") + if err != nil { + // Not on AWS EC2 - return empty slice without error + // This prevents unnecessary retries when running on baremetal, GCP, etc. + log.Debug("not on AWS EC2", "error", err) + return nil, nil + } + + // Collect all available host metadata attributes + // Following OpenTelemetry semantic conventions for host resources: + // https://opentelemetry.io/docs/specs/semconv/resource/host/ + var entries []Entry + + // host.id - unique host identifier (instance ID in AWS) + entries = append(entries, Entry{Key: attr.HostID, Value: instanceID}) + + // host.type - machine type (e.g., t3.micro, m5.large) + if instanceType, err := getMetadata("instance-type"); err == nil { + entries = append(entries, Entry{Key: attr.HostType, Value: instanceType}) + } else { + log.Debug("failed to get instance type", "error", err) + } + + // host.name - hostname + if hostname, err := getMetadata("hostname"); err == nil { + entries = append(entries, Entry{Key: attr.HostName, Value: hostname}) + } else { + log.Debug("failed to get hostname", "error", err) + } + + //TODO + //attr.HostImageID + //attr.HostImageName + //attr.HostImageVersion + + return entries, nil +} diff --git a/pkg/appolly/meta/meta_node_test.go b/pkg/appolly/meta/meta_node_test.go index c1ee43c1da..c1591271da 100644 --- a/pkg/appolly/meta/meta_node_test.go +++ b/pkg/appolly/meta/meta_node_test.go @@ -6,28 +6,27 @@ package meta import ( "context" "errors" - "iter" "sync/atomic" "testing" "testing/synctest" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" ) func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { synctest.Test(t, func(t *testing.T) { - ctx := context.Background() // Create fetchers that fail different numbers of times before succeeding failOnce := makeFetcherThatFailsNTimes(1, 
"fetcher1", "value1") - alwaysFails := func(ctx context.Context) (iter.Seq2[attr.Name, string], error) { + alwaysFails := func(ctx context.Context) ([]Entry, error) { return nil, errors.New("permanent failure") } failTwice := makeFetcherThatFailsNTimes(2, "fetcher2", "value2") succeedImmediately := makeFetcherThatFailsNTimes(0, "fetcher3", "value3") - entries := fetchEntries(ctx, failOnce, alwaysFails, failTwice, succeedImmediately) + entries := fetchEntries(t.Context(), failOnce, alwaysFails, failTwice, succeedImmediately) // All fetchers should eventually succeed and return their data require.Equal(t, []Entry{ @@ -39,20 +38,44 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { }) } +func TestFetchEntries_DeduplicateByPriority(t *testing.T) { + entries := fetchEntries(t.Context(), + // lowest-priority fetcher + func(ctx context.Context) ([]Entry, error) { + return []Entry{ + {Key: "some.local.stuff", Value: "something"}, + {Key: "host.id", Value: "should-be-overridden"}, + {Key: "host.name", Value: "foo-hostname"}, + }, nil + }, + // highest-priority fetcher + func(ctx context.Context) ([]Entry, error) { + return []Entry{ + {Key: "foo", Value: "bar"}, + {Key: "host.id", Value: "vm-01234567"}, + {Key: "baz", Value: "bae"}, + }, nil + }, + ) + assert.Equal(t, []Entry{ + {Key: "some.local.stuff", Value: "something"}, + {Key: "host.id", Value: "vm-01234567"}, + {Key: "host.name", Value: "foo-hostname"}, + {Key: "foo", Value: "bar"}, + {Key: "baz", Value: "bae"}, + }, entries) +} + func makeFetcherThatFailsNTimes(failCount int, key, value string) fetcher { attempts := atomic.Int32{} - return func(ctx context.Context) (iter.Seq2[attr.Name, string], error) { + return func(ctx context.Context) ([]Entry, error) { attempt := attempts.Add(1) if attempt <= int32(failCount) { return nil, errors.New("simulated failure") } - return seq(key, value), nil - } -} - -func seq(key, value string) iter.Seq2[attr.Name, string] { - return func(yield func(attr.Name, string) bool) { 
- yield(attr.Name(key+"_1"), value+"_1") - yield(attr.Name(key+"_2"), value+"_2") + return []Entry{ + {Key: attr.Name(key + "_1"), Value: value + "_1"}, + {Key: attr.Name(key + "_2"), Value: value + "_2"}, + }, nil } } diff --git a/pkg/export/attributes/names/attrs.go b/pkg/export/attributes/names/attrs.go index 129abfb777..47a9621882 100644 --- a/pkg/export/attributes/names/attrs.go +++ b/pkg/export/attributes/names/attrs.go @@ -163,8 +163,12 @@ const ( ServiceName = Name(semconv.ServiceNameKey) ServiceNamespace = Name(semconv.ServiceNamespaceKey) - HostName = Name(semconv.HostNameKey) - HostID = Name(semconv.HostIDKey) + HostID = Name(semconv.HostIDKey) + HostImageID = Name(semconv.HostImageIDKey) + HostImageName = Name(semconv.HostImageNameKey) + HostImageVersion = Name(semconv.HostImageVersionKey) + HostName = Name(semconv.HostNameKey) + HostType = Name(semconv.HostTypeKey) ServiceInstanceID = Name(semconv.ServiceInstanceIDKey) SkipSpanMetrics = Name("span.metrics.skip") diff --git a/pkg/internal/helpers/iters/iters.go b/pkg/internal/helpers/iters/iters.go deleted file mode 100644 index be38ac311c..0000000000 --- a/pkg/internal/helpers/iters/iters.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -// Package iters provides some helper functions for confortably working with iter.Seq -// and iter.Seq2. -// The code is copied from https://github.com/mariomac/iters, but we only keep there -// the functions we need to minimize external dependencies, in number and surface. 
- -package iters // import "go.opentelemetry.io/obi/pkg/internal/helpers/iters" - -import "iter" - -// Empty2 returns an empty iter.Seq2 -func Empty2[T1, T2 any]() iter.Seq2[T1, T2] { - return func(_ func(T1, T2) bool) {} -} - -// Concat2 creates a lazily concatenated iter.Seq2 whose elements are all the elements of the first -// provided iter.Seq2 followed by all the elements of the second provided iter.Seq2, followed by the -// elements of the third iter.Seq2 (if any), and so on. -func Concat2[K, V any](seqs ...iter.Seq2[K, V]) iter.Seq2[K, V] { - return func(yield func(K, V) bool) { - for _, seq := range seqs { - for k, v := range seq { - if !yield(k, v) { - return - } - } - } - } -} - -// Map2Seq transforms an input iter.Seq2 into an iter.Seq by applying a mapper function to each element -func Map2Seq[K, V, O any](input iter.Seq2[K, V], mapper func(K, V) O) iter.Seq[O] { - return func(yield func(O) bool) { - for k, v := range input { - if !yield(mapper(k, v)) { - return - } - } - } -} diff --git a/pkg/internal/helpers/iters/iters_test.go b/pkg/internal/helpers/iters/iters_test.go deleted file mode 100644 index c08ccb1099..0000000000 --- a/pkg/internal/helpers/iters/iters_test.go +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -package iters - -import ( - "iter" - "maps" - "slices" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestEmpty2(t *testing.T) { - assert.Empty(t, maps.Collect(Empty2[int, int]())) -} - -func TestConcat2_Map2Seq(t *testing.T) { - var new = func(k []int, v []int) iter.Seq2[int, int] { - return func(yield func(int, int) bool) { - for i := 0; i < len(k) && i < len(v); i++ { - if !yield(k[i], v[i]) { - return - } - } - } - } - var kvTuple = func(k, v int) [2]int { return [2]int{k, v} } - - concat := Concat2[int, int]( - new([]int{1, 2, 3}, []int{4, 5, 6}), - new([]int{7, 8, 9}, []int{10, 11, 12}), - new([]int{13, 14, 15}, []int{16, 17, 18}), - ) - - 
assert.Equal(t, - [][2]int{{1, 4}, {2, 5}, {3, 6}, {7, 10}, {8, 11}, {9, 12}, {13, 16}, {14, 17}, {15, 18}}, - slices.Collect(Map2Seq(concat, kvTuple))) - // checking that sequences can be iterated multiple times - assert.Equal(t, - [][2]int{{1, 4}, {2, 5}, {3, 6}, {7, 10}, {8, 11}, {9, 12}, {13, 16}, {14, 17}, {15, 18}}, - slices.Collect(Map2Seq(concat, kvTuple))) -} From 085c5c9e65af97ecf25ef303f10166cf8554b1fe Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Fri, 13 Feb 2026 12:34:07 +0100 Subject: [PATCH 03/17] Replaced contextInfo.HostID by NodeStore. --- go.mod | 2 +- internal/test/integration/aws_test.go | 1 + pkg/appolly/meta/meta_node.go | 78 +++++++---- pkg/appolly/meta/meta_node_aws.go | 75 ---------- pkg/appolly/meta/meta_node_kube.go | 37 +++++ pkg/appolly/meta/meta_node_linux.go | 38 +++++ pkg/appolly/meta/meta_node_notlinux.go | 12 ++ pkg/appolly/meta/meta_node_otel_detector.go | 41 ++++++ pkg/appolly/meta/meta_node_test.go | 74 ++++++---- pkg/export/attributes/names/attrs.go | 10 +- pkg/pipe/global/context.go | 8 +- pkg/pipe/global/host_id.go | 145 -------------------- pkg/pipe/global/host_id_test.go | 20 --- 13 files changed, 239 insertions(+), 302 deletions(-) delete mode 100644 pkg/appolly/meta/meta_node_aws.go create mode 100644 pkg/appolly/meta/meta_node_kube.go create mode 100644 pkg/appolly/meta/meta_node_linux.go create mode 100644 pkg/appolly/meta/meta_node_notlinux.go create mode 100644 pkg/appolly/meta/meta_node_otel_detector.go delete mode 100644 pkg/pipe/global/host_id.go delete mode 100644 pkg/pipe/global/host_id_test.go diff --git a/go.mod b/go.mod index 56d56e4e4e..91af101767 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.25.6 require ( github.com/AlessandroPomponio/go-gibberish v0.0.0-20191004143433-a2d4156f0396 - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.15 github.com/caarlos0/env/v9 v9.0.0 github.com/cilium/ebpf v0.20.0 github.com/containers/common v0.64.2 @@ -99,6 +98,7 @@ require ( github.com/aws/aws-sdk-go-v2 
v1.40.1 // indirect github.com/aws/aws-sdk-go-v2/config v1.32.3 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.19.3 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.15 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.15 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.15 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect diff --git a/internal/test/integration/aws_test.go b/internal/test/integration/aws_test.go index 21362d9bd2..c2d53ba5e8 100644 --- a/internal/test/integration/aws_test.go +++ b/internal/test/integration/aws_test.go @@ -12,6 +12,7 @@ import ( "github.com/ory/dockertest/v3/docker" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/obi/internal/test/integration/components/promtest" ti "go.opentelemetry.io/obi/pkg/test/integration" ) diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index 0e424e9577..a472077376 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -6,10 +6,17 @@ package meta import ( "context" "log/slog" + "slices" + "strings" "sync" "time" + "go.opentelemetry.io/contrib/detectors/aws/ec2/v2" + "go.opentelemetry.io/contrib/detectors/azure/azurevm" + "go.opentelemetry.io/contrib/detectors/gcp" + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" + "go.opentelemetry.io/obi/pkg/kube" ) func nslog() *slog.Logger { @@ -34,9 +41,12 @@ const ( // because this would mean that OBI is not being executed in that cloud provider. // But we can retry if the cloud API endpoint returns 5xx errors, as this would indicate // a temporary unavailability in the Cloud Metadata sevice. 
-type fetcher func(ctx context.Context) ([]Entry, error) +type fetcher func(ctx context.Context) (NodeStore, error) type NodeStore struct { + // HostID is a special attribute that needs to be frequently accessed + // so it's stored separately from the rest of metadata entries + HostID string Metadata []Entry } @@ -47,24 +57,30 @@ type Entry struct { func NewNodeStore( ctx context.Context, -) *NodeStore { - return &NodeStore{ - Metadata: fetchEntries(ctx, - awsNodeFetcher, - ), - } + kubeInformer *kube.MetadataProvider, +) NodeStore { + return fetchEntries(ctx, + // some fetchers will only retrieve the host name while others + // will retrieve also host attributes that will be merged + // in order of the priority below (the later the highest) + linuxLocalFetcher, + kubeNodeFetcher(kubeInformer), + otelNodeFetcher(azurevm.New()), + otelNodeFetcher(gcp.NewDetector()), + otelNodeFetcher(ec2.NewResourceDetector()), + ) } func fetchEntries( ctx context.Context, fetchers ...fetcher, -) []Entry { +) NodeStore { log := nslog() wg := sync.WaitGroup{} // we run in parallel to avoid that timeouts/retries delay the startup too much // but we want to keep the priority of the fetchers, so later fetchers can override // some data from previous fetchers - results := make([][]Entry, len(fetchers)) + results := make([]NodeStore, len(fetchers)) for i, fetch := range fetchers { wg.Go(func() { results[i] = backoffFetch(ctx, fetch, log.With("fetcher", i)) @@ -72,15 +88,21 @@ func fetchEntries( } wg.Wait() - // Concatenate all results maintaining order - var allEntries []Entry - for _, entries := range results { - allEntries = append(allEntries, entries...) 
+ // Merge all results maintaining priority + merged := NodeStore{} + for _, store := range results { + merged.merge(store) } - return dedupeKeys(allEntries) + + // for consistency, sort alphabetically by attribute + slices.SortFunc(merged.Metadata, func(l, r Entry) int { + return strings.Compare(string(l.Key), string(r.Key)) + }) + + return merged } -func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) []Entry { +func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) NodeStore { backoff := retryStartInterval start := time.Now() for { @@ -91,7 +113,7 @@ func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) []Entry // exponential backoff retry strategy if time.Since(start) > retryTimeout { log.Warn("timeout reached while looking for metadata. Giving up", "error", err) - return nil + return NodeStore{} } log.Debug("can't fetch metadata. Will retry", "retryAfter", backoff, "error", err) select { @@ -99,22 +121,30 @@ func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) []Entry // continue loop! case <-ctx.Done(): log.Debug("context canceled. Exiting") - return nil + return NodeStore{} } backoff = min(backoff*2, retryMaxInterval) } } -func dedupeKeys(entries []Entry) []Entry { +// merges the attributes. 
On collision, the src NodeStore will overwrite +// the target NodeStore +func (ns *NodeStore) merge(src NodeStore) { + if src.HostID != "" { + ns.HostID = src.HostID + } keyPos := map[attr.Name]int{} - out := make([]Entry, 0, len(entries)) - for _, entry := range entries { + for i, att := range ns.Metadata { + keyPos[att.Key] = i + } + for _, entry := range src.Metadata { if pos, ok := keyPos[entry.Key]; ok { - out[pos] = entry + // Key is already in destination: overwrite + ns.Metadata[pos] = entry } else { - out = append(out, entry) - keyPos[entry.Key] = len(out) - 1 + ns.Metadata = append(ns.Metadata, entry) + // theoretically should not be necessary unless src has duplicate Keys + keyPos[entry.Key] = len(ns.Metadata) - 1 } } - return out } diff --git a/pkg/appolly/meta/meta_node_aws.go b/pkg/appolly/meta/meta_node_aws.go deleted file mode 100644 index 308d448229..0000000000 --- a/pkg/appolly/meta/meta_node_aws.go +++ /dev/null @@ -1,75 +0,0 @@ -package meta - -import ( - "context" - "io" - "log/slog" - - "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" - attr "go.opentelemetry.io/obi/pkg/export/attributes/names" -) - -func awsNodeFetcher(ctx context.Context) ([]Entry, error) { - log := slog.With("component", "meta.NodeStore.awsNodeFetcher") - - // Create IMDS client with default options - // The client will use IMDSv2 by default with a 5-second timeout - client := imds.New(imds.Options{}) - - // Helper function to get metadata from a path - getMetadata := func(path string) (string, error) { - output, err := client.GetMetadata(ctx, &imds.GetMetadataInput{ - Path: path, - }) - if err != nil { - return "", err - } - defer output.Content.Close() - - data, err := io.ReadAll(output.Content) - if err != nil { - return "", err - } - return string(data), nil - } - - // Try to get instance ID first to check if we're on AWS EC2 - // If this fails, we're likely not on AWS, so return empty without error - // (no point in retrying if IMDS is not available) - instanceID, err 
:= getMetadata("instance-id") - if err != nil { - // Not on AWS EC2 - return empty slice without error - // This prevents unnecessary retries when running on baremetal, GCP, etc. - log.Debug("not on AWS EC2", "error", err) - return nil, nil - } - - // Collect all available host metadata attributes - // Following OpenTelemetry semantic conventions for host resources: - // https://opentelemetry.io/docs/specs/semconv/resource/host/ - var entries []Entry - - // host.id - unique host identifier (instance ID in AWS) - entries = append(entries, Entry{Key: attr.HostID, Value: instanceID}) - - // host.type - machine type (e.g., t3.micro, m5.large) - if instanceType, err := getMetadata("instance-type"); err == nil { - entries = append(entries, Entry{Key: attr.HostType, Value: instanceType}) - } else { - log.Debug("failed to get instance type", "error", err) - } - - // host.name - hostname - if hostname, err := getMetadata("hostname"); err == nil { - entries = append(entries, Entry{Key: attr.HostName, Value: hostname}) - } else { - log.Debug("failed to get hostname", "error", err) - } - - //TODO - //attr.HostImageID - //attr.HostImageName - //attr.HostImageVersion - - return entries, nil -} diff --git a/pkg/appolly/meta/meta_node_kube.go b/pkg/appolly/meta/meta_node_kube.go new file mode 100644 index 0000000000..7f727b1c37 --- /dev/null +++ b/pkg/appolly/meta/meta_node_kube.go @@ -0,0 +1,37 @@ +package meta + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "go.opentelemetry.io/obi/pkg/kube" +) + +func kubeNodeFetcher(k8sInformer *kube.MetadataProvider) fetcher { + return func(ctx context.Context) (NodeStore, error) { + if !k8sInformer.IsKubeEnabled() { + return NodeStore{}, nil + } + nodeName, err := k8sInformer.CurrentNodeName(ctx) + if err != nil { + // forwarding an error will force the NodeStore to + // retry until timeout + return NodeStore{}, err + } + kubeClient, err := k8sInformer.KubeClient() + if err != nil { + return NodeStore{}, 
err + } + nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + FieldSelector: "metadata.name=" + nodeName, + }) + if err != nil || len(nodes.Items) == 0 { + return NodeStore{}, fmt.Errorf("can't get node %s: %w", nodeName, err) + } + return NodeStore{ + HostID: nodes.Items[0].Status.NodeInfo.MachineID, + }, nil + } +} diff --git a/pkg/appolly/meta/meta_node_linux.go b/pkg/appolly/meta/meta_node_linux.go new file mode 100644 index 0000000000..c818e4928a --- /dev/null +++ b/pkg/appolly/meta/meta_node_linux.go @@ -0,0 +1,38 @@ +package meta + +import ( + "bytes" + "context" + "fmt" + "log/slog" + "os" +) + +func linuxLocalFetcher(ctx context.Context) (NodeStore, error) { + mid, err := fetchMachineID() + if err != nil { + // If we can't read host ID, we don't retry as it is mostly + // (1) this linux distribution does not have the files where we are supposing + // (2) there is some unrecoverable disk error + // (3) we lack permissions + // Then in this case, we only log a debug message + slog.Debug("can't get local machine ID", + "component", "meta.linuxLocalFetcher", + "error", err) + } + return NodeStore{ + HostID: mid, + }, nil +} + +func fetchMachineID() (string, error) { + if result, err := os.ReadFile("/etc/machine-id"); err == nil && len(bytes.TrimSpace(result)) > 0 { + return string(bytes.TrimSpace(result)), nil + } + + if result, err := os.ReadFile("/var/lib/dbus/machine-id"); err == nil && len(bytes.TrimSpace(result)) > 0 { + return string(bytes.TrimSpace(result)), nil + } else { + return "", fmt.Errorf("can't read host ID: %w", err) + } +} diff --git a/pkg/appolly/meta/meta_node_notlinux.go b/pkg/appolly/meta/meta_node_notlinux.go new file mode 100644 index 0000000000..e2708d7ec6 --- /dev/null +++ b/pkg/appolly/meta/meta_node_notlinux.go @@ -0,0 +1,12 @@ +//go:build !linux + +package meta + +import ( + "context" +) + +// permits compilation in non-linux environments +func linuxLocalFetcher(ctx context.Context) (NodeStore, error) { + 
return NodeStore{}, nil +} diff --git a/pkg/appolly/meta/meta_node_otel_detector.go b/pkg/appolly/meta/meta_node_otel_detector.go new file mode 100644 index 0000000000..7b7e15f01b --- /dev/null +++ b/pkg/appolly/meta/meta_node_otel_detector.go @@ -0,0 +1,41 @@ +package meta + +import ( + "context" + "fmt" + "log/slog" + + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.38.0" + + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" +) + +func otelNodeFetcher(detector resource.Detector) fetcher { + log := slog.With("component", "meta.NodeStore.otelNodeFetcher", + "detector", fmt.Sprintf("%T", detector)[1:]) + + return func(ctx context.Context) (NodeStore, error) { + resource, err := detector.Detect(ctx) + // none of the errors from the ec2 detect are retriable, so we just log them. + if err != nil { + log.Debug("failed to detect AWS EC2 metadata", "error", err) + } + if resource == nil { + return NodeStore{}, nil + } + // In any case, the API can return an error with a valid (partial resource) + attrs := resource.Iter() + store := NodeStore{Metadata: make([]Entry, 0, attrs.Len())} + for attrs.Next() { + at := attrs.Attribute() + if at.Key == semconv.HostIDKey { + store.HostID = at.Value.Emit() + } else { + store.Metadata = append(store.Metadata, + Entry{Key: attr.Name(at.Key), Value: at.Value.Emit()}) + } + } + return store, nil + } +} diff --git a/pkg/appolly/meta/meta_node_test.go b/pkg/appolly/meta/meta_node_test.go index c1591271da..a1970abf11 100644 --- a/pkg/appolly/meta/meta_node_test.go +++ b/pkg/appolly/meta/meta_node_test.go @@ -12,16 +12,16 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" ) func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { synctest.Test(t, func(t *testing.T) { - // Create fetchers that fail different numbers of times before succeeding failOnce := makeFetcherThatFailsNTimes(1, "fetcher1", 
"value1") - alwaysFails := func(ctx context.Context) ([]Entry, error) { - return nil, errors.New("permanent failure") + alwaysFails := func(ctx context.Context) (NodeStore, error) { + return NodeStore{}, errors.New("permanent failure") } failTwice := makeFetcherThatFailsNTimes(2, "fetcher2", "value2") succeedImmediately := makeFetcherThatFailsNTimes(0, "fetcher3", "value3") @@ -29,10 +29,16 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { entries := fetchEntries(t.Context(), failOnce, alwaysFails, failTwice, succeedImmediately) // All fetchers should eventually succeed and return their data - require.Equal(t, []Entry{ - {Key: "fetcher1_1", Value: "value1_1"}, {Key: "fetcher1_2", Value: "value1_2"}, - {Key: "fetcher2_1", Value: "value2_1"}, {Key: "fetcher2_2", Value: "value2_2"}, - {Key: "fetcher3_1", Value: "value3_1"}, {Key: "fetcher3_2", Value: "value3_2"}, + require.Equal(t, NodeStore{ + HostID: "host_fetcher3", + Metadata: []Entry{ + {Key: "fetcher1_1", Value: "value1_1"}, + {Key: "fetcher1_2", Value: "value1_2"}, + {Key: "fetcher2_1", Value: "value2_1"}, + {Key: "fetcher2_2", Value: "value2_2"}, + {Key: "fetcher3_1", Value: "value3_1"}, + {Key: "fetcher3_2", Value: "value3_2"}, + }, }, entries) synctest.Wait() }) @@ -41,41 +47,53 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { func TestFetchEntries_DeduplicateByPriority(t *testing.T) { entries := fetchEntries(t.Context(), // lowest-priority fetcher - func(ctx context.Context) ([]Entry, error) { - return []Entry{ - {Key: "some.local.stuff", Value: "something"}, - {Key: "host.id", Value: "should-be-overridden"}, - {Key: "host.name", Value: "foo-hostname"}, + func(ctx context.Context) (NodeStore, error) { + return NodeStore{ + HostID: "should-be-overridden", + Metadata: []Entry{ + {Key: "some.local.stuff", Value: "something"}, + {Key: "cloud.stuff", Value: "should-be-overridden"}, + {Key: "host.name", Value: "foo-hostname"}, + }, }, nil }, // highest-priority fetcher - func(ctx 
context.Context) ([]Entry, error) { - return []Entry{ - {Key: "foo", Value: "bar"}, - {Key: "host.id", Value: "vm-01234567"}, - {Key: "baz", Value: "bae"}, + func(ctx context.Context) (NodeStore, error) { + return NodeStore{ + HostID: "vm-01234567", + Metadata: []Entry{ + {Key: "foo", Value: "bar"}, + {Key: "cloud.stuff", Value: "the-cloud-stuff"}, + {Key: "baz", Value: "bae"}, + }, }, nil }, ) - assert.Equal(t, []Entry{ - {Key: "some.local.stuff", Value: "something"}, - {Key: "host.id", Value: "vm-01234567"}, - {Key: "host.name", Value: "foo-hostname"}, - {Key: "foo", Value: "bar"}, - {Key: "baz", Value: "bae"}, + assert.Equal(t, NodeStore{ + HostID: "vm-01234567", + Metadata: []Entry{ + {Key: "baz", Value: "bae"}, + {Key: "cloud.stuff", Value: "the-cloud-stuff"}, + {Key: "foo", Value: "bar"}, + {Key: "host.name", Value: "foo-hostname"}, + {Key: "some.local.stuff", Value: "something"}, + }, }, entries) } func makeFetcherThatFailsNTimes(failCount int, key, value string) fetcher { attempts := atomic.Int32{} - return func(ctx context.Context) ([]Entry, error) { + return func(ctx context.Context) (NodeStore, error) { attempt := attempts.Add(1) if attempt <= int32(failCount) { - return nil, errors.New("simulated failure") + return NodeStore{}, errors.New("simulated failure") } - return []Entry{ - {Key: attr.Name(key + "_1"), Value: value + "_1"}, - {Key: attr.Name(key + "_2"), Value: value + "_2"}, + return NodeStore{ + HostID: "host_" + key, + Metadata: []Entry{ + {Key: attr.Name(key + "_1"), Value: value + "_1"}, + {Key: attr.Name(key + "_2"), Value: value + "_2"}, + }, }, nil } } diff --git a/pkg/export/attributes/names/attrs.go b/pkg/export/attributes/names/attrs.go index 47a9621882..761294d079 100644 --- a/pkg/export/attributes/names/attrs.go +++ b/pkg/export/attributes/names/attrs.go @@ -163,12 +163,10 @@ const ( ServiceName = Name(semconv.ServiceNameKey) ServiceNamespace = Name(semconv.ServiceNamespaceKey) - HostID = Name(semconv.HostIDKey) - HostImageID = 
Name(semconv.HostImageIDKey) - HostImageName = Name(semconv.HostImageNameKey) - HostImageVersion = Name(semconv.HostImageVersionKey) - HostName = Name(semconv.HostNameKey) - HostType = Name(semconv.HostTypeKey) + HostID = Name(semconv.HostIDKey) + HostImageID = Name(semconv.HostImageIDKey) + HostName = Name(semconv.HostNameKey) + HostType = Name(semconv.HostTypeKey) ServiceInstanceID = Name(semconv.ServiceInstanceIDKey) SkipSpanMetrics = Name("span.metrics.skip") diff --git a/pkg/pipe/global/context.go b/pkg/pipe/global/context.go index dcaea3fa55..a6d1e6b998 100644 --- a/pkg/pipe/global/context.go +++ b/pkg/pipe/global/context.go @@ -4,6 +4,7 @@ package global // import "go.opentelemetry.io/obi/pkg/pipe/global" import ( + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/obi/pkg/appolly/app/request" @@ -20,9 +21,10 @@ import ( // ContextInfo stores some context information that must be shared across some nodes of the // processing graph. type ContextInfo struct { - // HostID of the host running OBI. Unless testing environments, this value must be - // automatically set after invoking FetchHostID - HostID string + // NodeInfo of the node (physical, VM, cloud instance...) running OBI. + // Including the HostID and other host metadata Attributes + NodeInfo meta.NodeStore + // AppO11y stores context information that is only required for application observability. // Its values must be initialized by the App O11y code and shouldn't be accessed from the // NetO11y part. 
diff --git a/pkg/pipe/global/host_id.go b/pkg/pipe/global/host_id.go deleted file mode 100644 index cf48b6b861..0000000000 --- a/pkg/pipe/global/host_id.go +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -package global // import "go.opentelemetry.io/obi/pkg/pipe/global" - -import ( - "bytes" - "context" - "errors" - "fmt" - "log/slog" - "os" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "go.opentelemetry.io/contrib/detectors/aws/ec2/v2" - "go.opentelemetry.io/contrib/detectors/azure/azurevm" - "go.opentelemetry.io/contrib/detectors/gcp" - "go.opentelemetry.io/otel/sdk/resource" - semconv "go.opentelemetry.io/otel/semconv/v1.38.0" -) - -type hostIDFetcher func(context.Context, time.Duration) (string, error) - -type fetcher struct { - name string - fetch hostIDFetcher -} - -func cilog() *slog.Logger { - return slog.With("component", "ContextInfo") -} - -// FetchHostID tries to get the host ID from one of the following sources, by priority -// 1. If Beyla runs in AWS, GCP or Azure, it will take the instance ID -// 2. Otherwise, will try to read the Kubernetes Node MachineID field -// 3. Otherwise, will try to read the machine ID from the local OS filesystem -// 4. Otherwise, will fallback to the Hostname -// This process is known to fail when Beyla runs inside a Kubernetes Pod out of the cloud providers -// mentioned in (1). In that case, the host.id will be later set to the full hostname. 
-// This method must be invoked once the ContextInfo object is completely initialized -func (ci *ContextInfo) FetchHostID(ctx context.Context, timeout time.Duration) { - log := cilog().With("func", "fetchHostID") - fetchers := []fetcher{ - {name: "AWS", fetch: ec2HostIDFetcher}, - {name: "Azure", fetch: azureHostIDFetcher}, - {name: "GCP", fetch: gcpHostIDFetcher}, - {name: "KubeNode", fetch: ci.kubeNodeFetcher}, - {name: "local", fetch: linuxLocalMachineIDFetcher}, - } - // if all the methods fail, keep at least the fallback method error - var err error - for _, f := range fetchers { - log := log.With("fetcher", f.name) - log.Debug("trying to fetch host ID") - var id string - if id, err = f.fetch(ctx, timeout); err == nil { - log.Info("got host ID", "hostID", id) - ci.HostID = id - return - } - log.Debug("didn't get host ID", "cause", err) - } - log.Debug("falling back to local host ID. This might be inaccurate in containerized systems") - ci.HostID, err = os.Hostname() - if err != nil { - log.Warn("getting host ID from host name", "error", err) - } -} - -func azureHostIDFetcher(ctx context.Context, timeout time.Duration) (string, error) { - return detectHostID(ctx, timeout, azurevm.New()) -} - -func gcpHostIDFetcher(ctx context.Context, timeout time.Duration) (string, error) { - return detectHostID(ctx, timeout, gcp.NewDetector()) -} - -func ec2HostIDFetcher(ctx context.Context, timeout time.Duration) (string, error) { - return detectHostID(ctx, timeout, ec2.NewResourceDetector()) -} - -func detectHostID(ctx context.Context, timeout time.Duration, detector resource.Detector) (string, error) { - // passing a cancellable context to the detector.Detect(ctx) does not always - // end the connection prematurely, so we wrap its invocation into a goroutine - cctx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - resCh := make(chan *resource.Resource, 1) - errCh := make(chan error, 1) - go func() { - if res, err := detector.Detect(ctx); err != nil { - errCh 
<- err - } else { - resCh <- res - } - }() - var res *resource.Resource - select { - case res = <-resCh: // continue! - case err := <-errCh: - return "", err - case <-cctx.Done(): - return "", errors.New("timed out waiting for host ID connection") - } - for _, attr := range res.Attributes() { - if attr.Key == semconv.HostIDKey { - return attr.Value.Emit(), nil - } - } - return "", fmt.Errorf("can't find host.id in %v", res.Attributes()) -} - -func (ci *ContextInfo) kubeNodeFetcher(ctx context.Context, _ time.Duration) (string, error) { - if ci.K8sInformer == nil || !ci.K8sInformer.IsKubeEnabled() { - return "", errors.New("kubernetes is not enabled") - } - nodeName, err := ci.K8sInformer.CurrentNodeName(ctx) - if err != nil { - return "", fmt.Errorf("can't get node name: %w", err) - } - kubeClient, err := ci.K8sInformer.KubeClient() - if err != nil { - return "", fmt.Errorf("can't get kubernetes client: %w", err) - } - nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{ - FieldSelector: "metadata.name=" + nodeName, - }) - if err != nil || len(nodes.Items) == 0 { - return "", fmt.Errorf("can't get node %s: %w", nodeName, err) - } - return nodes.Items[0].Status.NodeInfo.MachineID, nil -} - -func linuxLocalMachineIDFetcher(_ context.Context, _ time.Duration) (string, error) { - if result, err := os.ReadFile("/etc/machine-id"); err == nil && len(bytes.TrimSpace(result)) > 0 { - return string(bytes.TrimSpace(result)), nil - } - - if result, err := os.ReadFile("/var/lib/dbus/machine-id"); err == nil && len(bytes.TrimSpace(result)) > 0 { - return string(bytes.TrimSpace(result)), nil - } else { - return "", fmt.Errorf("can't read host ID: %w", err) - } -} diff --git a/pkg/pipe/global/host_id_test.go b/pkg/pipe/global/host_id_test.go deleted file mode 100644 index e0fd069bec..0000000000 --- a/pkg/pipe/global/host_id_test.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -package global - 
-import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" -) - -func TestFetchTimeout(t *testing.T) { - ctxInfo := ContextInfo{} - start := time.Now() - ctxInfo.FetchHostID(t.Context(), time.Millisecond) - elapsed := time.Since(start) - - assert.Less(t, elapsed, time.Second) -} From 95e17446515abe634e3d89367e9f93426afce712 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Fri, 13 Feb 2026 12:54:43 +0100 Subject: [PATCH 04/17] add license headers --- pkg/appolly/meta/meta_node.go | 2 +- pkg/appolly/meta/meta_node_kube.go | 5 ++++- pkg/appolly/meta/meta_node_linux.go | 5 ++++- pkg/appolly/meta/meta_node_notlinux.go | 5 ++++- pkg/appolly/meta/meta_node_otel_detector.go | 5 ++++- pkg/pipe/global/context.go | 2 +- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index a472077376..7e18ccc31a 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -1,7 +1,7 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -package meta +package meta // import "go.opentelemetry.io/obi/pkg/appolly/meta" import ( "context" diff --git a/pkg/appolly/meta/meta_node_kube.go b/pkg/appolly/meta/meta_node_kube.go index 7f727b1c37..c1c88d1504 100644 --- a/pkg/appolly/meta/meta_node_kube.go +++ b/pkg/appolly/meta/meta_node_kube.go @@ -1,4 +1,7 @@ -package meta +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package meta // import "go.opentelemetry.io/obi/pkg/appolly/meta" import ( "context" diff --git a/pkg/appolly/meta/meta_node_linux.go b/pkg/appolly/meta/meta_node_linux.go index c818e4928a..f6e06c2142 100644 --- a/pkg/appolly/meta/meta_node_linux.go +++ b/pkg/appolly/meta/meta_node_linux.go @@ -1,4 +1,7 @@ -package meta +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package meta // import "go.opentelemetry.io/obi/pkg/appolly/meta" import ( "bytes" diff --git 
a/pkg/appolly/meta/meta_node_notlinux.go b/pkg/appolly/meta/meta_node_notlinux.go index e2708d7ec6..1634e593e1 100644 --- a/pkg/appolly/meta/meta_node_notlinux.go +++ b/pkg/appolly/meta/meta_node_notlinux.go @@ -1,6 +1,9 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + //go:build !linux -package meta +package meta // import "go.opentelemetry.io/obi/pkg/appolly/meta" import ( "context" diff --git a/pkg/appolly/meta/meta_node_otel_detector.go b/pkg/appolly/meta/meta_node_otel_detector.go index 7b7e15f01b..029c1a3491 100644 --- a/pkg/appolly/meta/meta_node_otel_detector.go +++ b/pkg/appolly/meta/meta_node_otel_detector.go @@ -1,4 +1,7 @@ -package meta +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package meta // import "go.opentelemetry.io/obi/pkg/appolly/meta" import ( "context" diff --git a/pkg/pipe/global/context.go b/pkg/pipe/global/context.go index a6d1e6b998..3b1de90da2 100644 --- a/pkg/pipe/global/context.go +++ b/pkg/pipe/global/context.go @@ -4,10 +4,10 @@ package global // import "go.opentelemetry.io/obi/pkg/pipe/global" import ( - "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/obi/pkg/appolly/app/request" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/docker" "go.opentelemetry.io/obi/pkg/export/attributes" "go.opentelemetry.io/obi/pkg/export/connector" From ec1a5b350986246b7ccc25e040cff8d10f4de7d4 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Fri, 13 Feb 2026 14:39:43 +0100 Subject: [PATCH 05/17] Integrated within CtxInfo --- internal/test/integration/dockerutil_test.go | 2 +- pkg/appolly/instrumenter_test.go | 3 ++- pkg/appolly/meta/meta_node.go | 6 +++++- pkg/appolly/meta/meta_node_notlinux.go | 2 +- pkg/appolly/meta/meta_node_test.go | 8 ++++---- pkg/export/otel/metrics.go | 15 ++++++++------- pkg/export/otel/metrics_internal.go | 2 +- pkg/export/otel/metrics_net.go | 2 +- 
pkg/export/otel/metrics_svc_graph.go | 2 +- pkg/export/otel/metrics_test.go | 3 ++- pkg/export/otel/traces.go | 2 +- pkg/export/prom/prom.go | 2 +- pkg/export/prom/prom_test.go | 3 ++- pkg/instrumenter/instrumenter.go | 12 +++++++----- pkg/obi/config.go | 6 +----- pkg/obi/config_test.go | 3 +-- pkg/pipe/global/context.go | 4 ++-- 17 files changed, 41 insertions(+), 36 deletions(-) diff --git a/internal/test/integration/dockerutil_test.go b/internal/test/integration/dockerutil_test.go index b45a8a2067..1142b70df1 100644 --- a/internal/test/integration/dockerutil_test.go +++ b/internal/test/integration/dockerutil_test.go @@ -151,7 +151,7 @@ type obi struct { } // instrument starts the OBI container to instrument the target application. -func (o obi) instrument(t *testing.T, network *dockertest.Network, resource *dockertest.Resource, configFile string) { //nolint:unparam // configFile is always passed in current usages but may vary in future +func (o obi) instrument(t *testing.T, network *dockertest.Network, resource *dockertest.Resource, configFile string) { t.Helper() t.Log("Starting OBI container with PID namespace sharing...") diff --git a/pkg/appolly/instrumenter_test.go b/pkg/appolly/instrumenter_test.go index 0f385390ca..32c66def4e 100644 --- a/pkg/appolly/instrumenter_test.go +++ b/pkg/appolly/instrumenter_test.go @@ -22,6 +22,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" "go.opentelemetry.io/obi/pkg/appolly/discover/exec" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/config" "go.opentelemetry.io/obi/pkg/export" "go.opentelemetry.io/obi/pkg/export/attributes" @@ -47,7 +48,7 @@ func gctx(groups attributes.AttrGroups, mcfg *otelcfg.MetricsConfig) *global.Con Metrics: imetrics.NoopReporter{}, MetricAttributeGroups: groups, K8sInformer: kube.NewMetadataProvider(kube.MetadataConfig{Enable: kubeflags.EnabledFalse}, imetrics.NoopReporter{}), - HostID: "host-id", + NodeMeta: 
meta.NodeStore{HostID: "host-id"}, OTELMetricsExporter: &otelcfg.MetricsExporterInstancer{Cfg: mcfg}, } } diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index 7e18ccc31a..93f0869391 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -40,7 +40,7 @@ const ( // For example, we must not retry if a cloud API endpoint does not exist or it returns 4xx errors, // because this would mean that OBI is not being executed in that cloud provider. // But we can retry if the cloud API endpoint returns 5xx errors, as this would indicate -// a temporary unavailability in the Cloud Metadata sevice. +// a temporary unavailability in the Cloud Metadata service. type fetcher func(ctx context.Context) (NodeStore, error) type NodeStore struct { @@ -57,6 +57,7 @@ type Entry struct { func NewNodeStore( ctx context.Context, + overrideHost string, kubeInformer *kube.MetadataProvider, ) NodeStore { return fetchEntries(ctx, @@ -68,6 +69,9 @@ func NewNodeStore( otelNodeFetcher(azurevm.New()), otelNodeFetcher(gcp.NewDetector()), otelNodeFetcher(ec2.NewResourceDetector()), + func(_ context.Context) (NodeStore, error) { + return NodeStore{HostID: overrideHost}, nil + }, ) } diff --git a/pkg/appolly/meta/meta_node_notlinux.go b/pkg/appolly/meta/meta_node_notlinux.go index 1634e593e1..671547ec5d 100644 --- a/pkg/appolly/meta/meta_node_notlinux.go +++ b/pkg/appolly/meta/meta_node_notlinux.go @@ -10,6 +10,6 @@ import ( ) // permits compilation in non-linux environments -func linuxLocalFetcher(ctx context.Context) (NodeStore, error) { +func linuxLocalFetcher(_ context.Context) (NodeStore, error) { return NodeStore{}, nil } diff --git a/pkg/appolly/meta/meta_node_test.go b/pkg/appolly/meta/meta_node_test.go index a1970abf11..57557672d4 100644 --- a/pkg/appolly/meta/meta_node_test.go +++ b/pkg/appolly/meta/meta_node_test.go @@ -20,7 +20,7 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { synctest.Test(t, func(t *testing.T) { // Create 
fetchers that fail different numbers of times before succeeding failOnce := makeFetcherThatFailsNTimes(1, "fetcher1", "value1") - alwaysFails := func(ctx context.Context) (NodeStore, error) { + alwaysFails := func(_ context.Context) (NodeStore, error) { return NodeStore{}, errors.New("permanent failure") } failTwice := makeFetcherThatFailsNTimes(2, "fetcher2", "value2") @@ -47,7 +47,7 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { func TestFetchEntries_DeduplicateByPriority(t *testing.T) { entries := fetchEntries(t.Context(), // lowest-priority fetcher - func(ctx context.Context) (NodeStore, error) { + func(_ context.Context) (NodeStore, error) { return NodeStore{ HostID: "should-be-overridden", Metadata: []Entry{ @@ -58,7 +58,7 @@ func TestFetchEntries_DeduplicateByPriority(t *testing.T) { }, nil }, // highest-priority fetcher - func(ctx context.Context) (NodeStore, error) { + func(_ context.Context) (NodeStore, error) { return NodeStore{ HostID: "vm-01234567", Metadata: []Entry{ @@ -83,7 +83,7 @@ func TestFetchEntries_DeduplicateByPriority(t *testing.T) { func makeFetcherThatFailsNTimes(failCount int, key, value string) fetcher { attempts := atomic.Int32{} - return func(ctx context.Context) (NodeStore, error) { + return func(_ context.Context) (NodeStore, error) { attempt := attempts.Add(1) if attempt <= int32(failCount) { return NodeStore{}, errors.New("simulated failure") diff --git a/pkg/export/otel/metrics.go b/pkg/export/otel/metrics.go index 471d97f1da..7b6eefd789 100644 --- a/pkg/export/otel/metrics.go +++ b/pkg/export/otel/metrics.go @@ -19,6 +19,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" "go.opentelemetry.io/obi/pkg/appolly/discover/exec" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/export/attributes" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/obi/pkg/export/imetrics" @@ -73,7 +74,7 @@ type MetricsReporter 
struct { ctx context.Context cfg *otelcfg.MetricsConfig jointMetricsCfg *perapp.MetricsConfig - hostID string + nodeMeta meta.NodeStore attributes *attributes.AttrSelector exporter sdkmetric.Exporter reporters otelcfg.ReporterPool[*svc.Attrs, *Metrics] @@ -215,7 +216,7 @@ func newMetricsReporter( is: is, targetMetrics: map[svc.UID]*TargetMetrics{}, attributes: attribProvider, - hostID: ctxInfo.HostID, + nodeMeta: ctxInfo.NodeMeta, input: input.Subscribe(msg.SubscriberName("otelMetrics.InputSpans")), processEvents: processEventCh.Subscribe(msg.SubscriberName("otelMetrics.ProcessEvents")), userAttribSelection: selectorCfg.SelectionCfg, @@ -637,7 +638,7 @@ func (mr *MetricsReporter) setupHostInfoMeter(meter instrument.Meter) error { attr := attributes.Field[*request.Span, attribute.KeyValue]{ ExposedName: string(GrafanaHostIDKey), Get: func(_ *request.Span) attribute.KeyValue { - return semconv.HostID(mr.hostID) + return semconv.HostID(mr.nodeMeta.HostID) }, } @@ -652,7 +653,7 @@ func (mr *MetricsReporter) newMetricsInstance(service *svc.Attrs) Metrics { var resourceAttributes []attribute.KeyValue if service != nil { mlog = mlog.With("service", service) - resourceAttributes = append(otelcfg.GetAppResourceAttrs(mr.hostID, service), otelcfg.ResourceAttrsFromEnv(service)...) + resourceAttributes = append(otelcfg.GetAppResourceAttrs(mr.nodeMeta.HostID, service), otelcfg.ResourceAttrsFromEnv(service)...) } mlog.Debug("creating new Metrics reporter") resources := resource.NewWithAttributes(semconv.SchemaURL, resourceAttributes...) 
@@ -786,7 +787,7 @@ func (mr *MetricsReporter) tracesResourceAttributes(service *svc.Attrs) attribut } extraAttrs := []attribute.KeyValue{ - semconv.HostID(mr.hostID), + semconv.HostID(mr.nodeMeta.HostID), } for k, v := range service.Metadata { @@ -816,7 +817,7 @@ func (mr *MetricsReporter) spanMetricAttributes() []attributes.Field[*request.Sp attributes.Field[*request.Span, attribute.KeyValue]{ ExposedName: string(attr.HostID.OTEL()), Get: func(_ *request.Span) attribute.KeyValue { - return semconv.HostID(mr.hostID) + return semconv.HostID(mr.nodeMeta.HostID) }, }) } @@ -1040,7 +1041,7 @@ func (mr *MetricsReporter) resourceAttrsForService(service *svc.Attrs) []attribu attribute.String(string(attr.Job), service.Job()), } - attrs = append(attrs, otelcfg.GetAppResourceAttrs(mr.hostID, service)...) + attrs = append(attrs, otelcfg.GetAppResourceAttrs(mr.nodeMeta.HostID, service)...) return append(attrs, otelcfg.ResourceAttrsFromEnv(service)...) } diff --git a/pkg/export/otel/metrics_internal.go b/pkg/export/otel/metrics_internal.go index 6aed4aa340..e4d7fb3348 100644 --- a/pkg/export/otel/metrics_internal.go +++ b/pkg/export/otel/metrics_internal.go @@ -56,7 +56,7 @@ func NewInternalMetricsReporter(ctx context.Context, ctxInfo *global.ContextInfo return nil, err } - res := newResourceInternal(ctxInfo.HostID) + res := newResourceInternal(ctxInfo.NodeMeta.HostID) provider := newInternalMeterProvider(res, &exporter, metrics.Interval) meter := provider.Meter("obi_internal") tracerFlushes, err := meter.Float64Histogram( diff --git a/pkg/export/otel/metrics_net.go b/pkg/export/otel/metrics_net.go index eee65f4527..08b18f6458 100644 --- a/pkg/export/otel/metrics_net.go +++ b/pkg/export/otel/metrics_net.go @@ -115,7 +115,7 @@ func newMetricsExporter( } exporter = instrumentMetricsExporter(ctxInfo.Metrics, exporter) - resource := createFilteredNetworkResource(ctxInfo.HostID, cfg.SelectorCfg.SelectionCfg) + resource := createFilteredNetworkResource(ctxInfo.NodeMeta.HostID, 
cfg.SelectorCfg.SelectionCfg) provider := newMeterProvider(resource, &exporter, cfg.Metrics.Interval) attrProv, err := attributes.NewAttrSelector(ctxInfo.MetricAttributeGroups, cfg.SelectorCfg) diff --git a/pkg/export/otel/metrics_svc_graph.go b/pkg/export/otel/metrics_svc_graph.go index 4edbba3707..eeb127d3df 100644 --- a/pkg/export/otel/metrics_svc_graph.go +++ b/pkg/export/otel/metrics_svc_graph.go @@ -120,7 +120,7 @@ func newSvcGraphMetricsReporter( ctx: ctx, cfg: cfg, is: is, - hostID: ctxInfo.HostID, + hostID: ctxInfo.NodeMeta.HostID, input: input.Subscribe(msg.SubscriberName("otel.SvcGraphMetricsReporter.input")), processEvents: processEventCh.Subscribe(msg.SubscriberName("otel.SvcGraphMetricsReporter.processEvents")), metricAttributes: serviceGraphGetters(unresolved, ctxInfo.K8sInformer.IsKubeEnabled()), diff --git a/pkg/export/otel/metrics_test.go b/pkg/export/otel/metrics_test.go index 646e0a955c..5ed9dea573 100644 --- a/pkg/export/otel/metrics_test.go +++ b/pkg/export/otel/metrics_test.go @@ -25,6 +25,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" "go.opentelemetry.io/obi/pkg/appolly/discover/exec" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/export" "go.opentelemetry.io/obi/pkg/export/attributes" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" @@ -808,7 +809,7 @@ func TestMetricResourceAttributes(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { mr := &MetricsReporter{ - hostID: "test-host-id", + nodeMeta: meta.NodeStore{HostID: "test-host-id"}, userAttribSelection: tc.attributeSelect, } diff --git a/pkg/export/otel/traces.go b/pkg/export/otel/traces.go index 3f6f584023..b8d62d069b 100644 --- a/pkg/export/otel/traces.go +++ b/pkg/export/otel/traces.go @@ -125,7 +125,7 @@ func (tr *tracesOTELReceiver) processSpans(ctx context.Context, exp exporter.Tra if tr.spanMetricsEnabled { envResourceAttrs = append(envResourceAttrs, 
attribute.Bool(string(attr.SkipSpanMetrics.OTEL()), true)) } - traces := tracesgen.GenerateTracesWithAttributes(tr.attributeCache, &sample.Span.Service, envResourceAttrs, tr.ctxInfo.HostID, spanGroup, reporterName, tr.ctxInfo.ExtraResourceAttributes...) + traces := tracesgen.GenerateTracesWithAttributes(tr.attributeCache, &sample.Span.Service, envResourceAttrs, tr.ctxInfo.NodeMeta.HostID, spanGroup, reporterName, tr.ctxInfo.ExtraResourceAttributes...) err := exp.ConsumeTraces(ctx, traces) if err != nil { // We can't do if errors.Is(err, queue.ErrQueueIsFull), since the queue package is internal diff --git a/pkg/export/prom/prom.go b/pkg/export/prom/prom.go index fda72c11fe..56349730a1 100644 --- a/pkg/export/prom/prom.go +++ b/pkg/export/prom/prom.go @@ -401,7 +401,7 @@ func newReporter( dockerEnabled: dockerEnabled, extraMetadataLabels: extraMetadataLabels, extraSpanMetadataLabels: extraSpanMetadataLabels, - hostID: ctxInfo.HostID, + hostID: ctxInfo.NodeMeta.HostID, clock: clock, is: is, promConnect: ctxInfo.Prometheus, diff --git a/pkg/export/prom/prom_test.go b/pkg/export/prom/prom_test.go index 40b429e9d4..20058e0e4a 100644 --- a/pkg/export/prom/prom_test.go +++ b/pkg/export/prom/prom_test.go @@ -25,6 +25,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" "go.opentelemetry.io/obi/pkg/appolly/discover/exec" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/export" "go.opentelemetry.io/obi/pkg/export/attributes" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" @@ -58,7 +59,7 @@ func TestAppMetricsExpiration(t *testing.T) { exporter, err := PrometheusEndpoint( &global.ContextInfo{ Prometheus: &connector.PrometheusManager{}, - HostID: "my-host", + NodeMeta: meta.NodeStore{HostID: "my-host"}, MetricAttributeGroups: g, }, &PrometheusConfig{ diff --git a/pkg/instrumenter/instrumenter.go b/pkg/instrumenter/instrumenter.go index 86a66d7ec0..82d312ee56 100644 --- 
a/pkg/instrumenter/instrumenter.go +++ b/pkg/instrumenter/instrumenter.go @@ -12,6 +12,7 @@ import ( "golang.org/x/sync/errgroup" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/docker" "go.opentelemetry.io/obi/pkg/export/attributes" "go.opentelemetry.io/obi/pkg/export/connector" @@ -172,11 +173,6 @@ func BuildCommonContextInfo( Prometheus: promMgr, OTELMetricsExporter: &otelcfg.MetricsExporterInstancer{Cfg: &config.OTELMetrics}, } - if config.Attributes.HostID.Override == "" { - ctxInfo.FetchHostID(ctx, config.Attributes.HostID.FetchTimeout) - } else { - ctxInfo.HostID = config.Attributes.HostID.Override - } ctxInfo.Metrics, err = internalMetrics(ctx, config, ctxInfo, promMgr) if err != nil { return nil, fmt.Errorf("can't create internal metrics: %w", err) @@ -194,6 +190,12 @@ func BuildCommonContextInfo( ServiceNameTemplate: templ, }, ctxInfo.Metrics) + ctxInfo.NodeMeta = meta.NewNodeStore( + ctx, + config.Attributes.HostID.Override, + ctxInfo.K8sInformer, + ) + ctxInfo.DockerMetadata = docker.NewStore() attributeGroups(config, ctxInfo) diff --git a/pkg/obi/config.go b/pkg/obi/config.go index 96d9a53f2d..f6d2526036 100644 --- a/pkg/obi/config.go +++ b/pkg/obi/config.go @@ -211,9 +211,7 @@ var DefaultConfig = Config{ InformersResyncPeriod: 30 * time.Minute, ResourceLabels: kube.DefaultResourceLabels, }, - HostID: HostIDConfig{ - FetchTimeout: 500 * time.Millisecond, - }, + HostID: HostIDConfig{}, RenameUnresolvedHosts: "unresolved", RenameUnresolvedHostsOutgoing: "outgoing", RenameUnresolvedHostsIncoming: "incoming", @@ -505,8 +503,6 @@ type Attributes struct { type HostIDConfig struct { // Override allows overriding the reported host.id in OBI Override string `yaml:"override" env:"OTEL_EBPF_HOST_ID"` - // FetchTimeout specifies the timeout for trying to fetch the HostID from diverse Cloud Providers - FetchTimeout time.Duration `yaml:"fetch_timeout" env:"OTEL_EBPF_HOST_ID_FETCH_TIMEOUT"` } type NodeJSConfig struct { diff --git 
a/pkg/obi/config_test.go b/pkg/obi/config_test.go index 00c41df3df..889f29b5d9 100644 --- a/pkg/obi/config_test.go +++ b/pkg/obi/config_test.go @@ -244,8 +244,7 @@ discovery: ResourceLabels: metaSources, }, HostID: HostIDConfig{ - Override: "the-host-id", - FetchTimeout: 4 * time.Second, + Override: "the-host-id", }, Select: attributes.Selection{ attributes.NetworkFlow.Section: attributes.InclusionLists{ diff --git a/pkg/pipe/global/context.go b/pkg/pipe/global/context.go index 3b1de90da2..80d9168903 100644 --- a/pkg/pipe/global/context.go +++ b/pkg/pipe/global/context.go @@ -21,9 +21,9 @@ import ( // ContextInfo stores some context information that must be shared across some nodes of the // processing graph. type ContextInfo struct { - // NodeInfo of the node (physical, VM, cloud instance...) running OBI. + // NodeMeta of the node (physical, VM, cloud instance...) running OBI. // Including the HostID and other host metadata Attributes - NodeInfo meta.NodeStore + NodeMeta meta.NodeStore // AppO11y stores context information that is only required for application observability. 
// Its values must be initialized by the App O11y code and shouldn't be accessed from the From 71fdd1bb76e76f03a0efbc67c523fd7873eddeb3 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Fri, 13 Feb 2026 15:22:00 +0100 Subject: [PATCH 06/17] integrated with metrics exporters --- pkg/appolly/instrumenter_test.go | 2 +- pkg/appolly/meta/meta_node.go | 39 ++++++++------ pkg/appolly/meta/meta_node_kube.go | 14 +++--- pkg/appolly/meta/meta_node_linux.go | 4 +- pkg/appolly/meta/meta_node_notlinux.go | 4 +- pkg/appolly/meta/meta_node_otel_detector.go | 8 +-- pkg/appolly/meta/meta_node_test.go | 28 ++++++----- pkg/export/otel/metrics.go | 16 +++--- pkg/export/otel/metrics_internal.go | 11 ++-- pkg/export/otel/metrics_svc_graph.go | 9 ++-- pkg/export/otel/metrics_test.go | 2 +- pkg/export/otel/otelcfg/common.go | 13 +++-- pkg/export/otel/traces.go | 2 +- pkg/export/otel/traces_test.go | 56 +++++++++++---------- pkg/export/otel/tracesgen/tracesgen.go | 11 ++-- pkg/export/prom/prom.go | 23 ++++++--- pkg/export/prom/prom_test.go | 2 +- pkg/pipe/global/context.go | 2 +- 18 files changed, 142 insertions(+), 104 deletions(-) diff --git a/pkg/appolly/instrumenter_test.go b/pkg/appolly/instrumenter_test.go index 32c66def4e..9fd1daafe0 100644 --- a/pkg/appolly/instrumenter_test.go +++ b/pkg/appolly/instrumenter_test.go @@ -48,7 +48,7 @@ func gctx(groups attributes.AttrGroups, mcfg *otelcfg.MetricsConfig) *global.Con Metrics: imetrics.NoopReporter{}, MetricAttributeGroups: groups, K8sInformer: kube.NewMetadataProvider(kube.MetadataConfig{Enable: kubeflags.EnabledFalse}, imetrics.NoopReporter{}), - NodeMeta: meta.NodeStore{HostID: "host-id"}, + NodeMeta: meta.NodeMeta{HostID: "host-id"}, OTELMetricsExporter: &otelcfg.MetricsExporterInstancer{Cfg: mcfg}, } } diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index 93f0869391..de013ba0d8 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -20,7 +20,7 @@ import ( ) func nslog() 
*slog.Logger { - return slog.With("component", "meta.NodeStore") + return slog.With("component", "meta.NodeMeta") } // TODO: make configurable @@ -30,6 +30,12 @@ const ( retryMaxInterval = 5 * time.Second ) +// some attributes from the node need to be filtered out, because they are +// going to be specified for each service instance +var filterAttrs []attr.Name = []attr.Name{ + attr.HostName, +} + // host metadata is common to all the instrumented applications within a single // physical node, cloud instance or local virtual machine. // This information only needs to be retrieved once at startup, and will be @@ -41,9 +47,9 @@ const ( // because this would mean that OBI is not being executed in that cloud provider. // But we can retry if the cloud API endpoint returns 5xx errors, as this would indicate // a temporary unavailability in the Cloud Metadata service. -type fetcher func(ctx context.Context) (NodeStore, error) +type fetcher func(ctx context.Context) (NodeMeta, error) -type NodeStore struct { +type NodeMeta struct { // HostID is a special attribute that needs to be frequently accessed // so it's stored separately from the rest of metadata entries HostID string @@ -59,7 +65,7 @@ func NewNodeStore( ctx context.Context, overrideHost string, kubeInformer *kube.MetadataProvider, -) NodeStore { +) NodeMeta { return fetchEntries(ctx, // some fetchers will only retrieve the host name while others // will retrieve also host attributes that will be merged @@ -69,8 +75,8 @@ func NewNodeStore( otelNodeFetcher(azurevm.New()), otelNodeFetcher(gcp.NewDetector()), otelNodeFetcher(ec2.NewResourceDetector()), - func(_ context.Context) (NodeStore, error) { - return NodeStore{HostID: overrideHost}, nil + func(_ context.Context) (NodeMeta, error) { + return NodeMeta{HostID: overrideHost}, nil }, ) } @@ -78,13 +84,13 @@ func NewNodeStore( func fetchEntries( ctx context.Context, fetchers ...fetcher, -) NodeStore { +) NodeMeta { log := nslog() wg := sync.WaitGroup{} // we run in 
parallel to avoid that timeouts/retries delay the startup too much // but we want to keep the priority of the fetchers, so later fetchers can override // some data from previous fetchers - results := make([]NodeStore, len(fetchers)) + results := make([]NodeMeta, len(fetchers)) for i, fetch := range fetchers { wg.Go(func() { results[i] = backoffFetch(ctx, fetch, log.With("fetcher", i)) @@ -93,7 +99,7 @@ func fetchEntries( wg.Wait() // Merge all results maintaining priority - merged := NodeStore{} + merged := NodeMeta{} for _, store := range results { merged.merge(store) } @@ -106,7 +112,7 @@ func fetchEntries( return merged } -func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) NodeStore { +func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) NodeMeta { backoff := retryStartInterval start := time.Now() for { @@ -117,7 +123,7 @@ func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) NodeStor // exponential backoff retry strategy if time.Since(start) > retryTimeout { log.Warn("timeout reached while looking for metadata. Giving up", "error", err) - return NodeStore{} + return NodeMeta{} } log.Debug("can't fetch metadata. Will retry", "retryAfter", backoff, "error", err) select { @@ -125,15 +131,15 @@ func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) NodeStor // continue loop! case <-ctx.Done(): log.Debug("context canceled. Exiting") - return NodeStore{} + return NodeMeta{} } backoff = min(backoff*2, retryMaxInterval) } } -// merges the attributes. On collision, the src NodeStore will overwrite -// the target NodeStore -func (ns *NodeStore) merge(src NodeStore) { +// merges the attributes. 
On collision, the src NodeMeta will overwrite +// the target NodeMeta +func (ns *NodeMeta) merge(src NodeMeta) { if src.HostID != "" { ns.HostID = src.HostID } @@ -142,6 +148,9 @@ func (ns *NodeStore) merge(src NodeStore) { keyPos[att.Key] = i } for _, entry := range src.Metadata { + if slices.Contains(filterAttrs, entry.Key) { + continue + } if pos, ok := keyPos[entry.Key]; ok { // Key is already in destination: overwrite ns.Metadata[pos] = entry diff --git a/pkg/appolly/meta/meta_node_kube.go b/pkg/appolly/meta/meta_node_kube.go index c1c88d1504..8fe87ba5fd 100644 --- a/pkg/appolly/meta/meta_node_kube.go +++ b/pkg/appolly/meta/meta_node_kube.go @@ -13,27 +13,27 @@ import ( ) func kubeNodeFetcher(k8sInformer *kube.MetadataProvider) fetcher { - return func(ctx context.Context) (NodeStore, error) { + return func(ctx context.Context) (NodeMeta, error) { if !k8sInformer.IsKubeEnabled() { - return NodeStore{}, nil + return NodeMeta{}, nil } nodeName, err := k8sInformer.CurrentNodeName(ctx) if err != nil { - // forwarding an error will force the NodeStore to + // forwarding an error will force the NodeMeta to // retry until timeout - return NodeStore{}, err + return NodeMeta{}, err } kubeClient, err := k8sInformer.KubeClient() if err != nil { - return NodeStore{}, err + return NodeMeta{}, err } nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{ FieldSelector: "metadata.name=" + nodeName, }) if err != nil || len(nodes.Items) == 0 { - return NodeStore{}, fmt.Errorf("can't get node %s: %w", nodeName, err) + return NodeMeta{}, fmt.Errorf("can't get node %s: %w", nodeName, err) } - return NodeStore{ + return NodeMeta{ HostID: nodes.Items[0].Status.NodeInfo.MachineID, }, nil } diff --git a/pkg/appolly/meta/meta_node_linux.go b/pkg/appolly/meta/meta_node_linux.go index f6e06c2142..4201001208 100644 --- a/pkg/appolly/meta/meta_node_linux.go +++ b/pkg/appolly/meta/meta_node_linux.go @@ -11,7 +11,7 @@ import ( "os" ) -func linuxLocalFetcher(ctx 
context.Context) (NodeStore, error) { +func linuxLocalFetcher(ctx context.Context) (NodeMeta, error) { mid, err := fetchMachineID() if err != nil { // If we can't read host ID, we don't retry as it is mostly @@ -23,7 +23,7 @@ func linuxLocalFetcher(ctx context.Context) (NodeStore, error) { "component", "meta.linuxLocalFetcher", "error", err) } - return NodeStore{ + return NodeMeta{ HostID: mid, }, nil } diff --git a/pkg/appolly/meta/meta_node_notlinux.go b/pkg/appolly/meta/meta_node_notlinux.go index 671547ec5d..df86cd5213 100644 --- a/pkg/appolly/meta/meta_node_notlinux.go +++ b/pkg/appolly/meta/meta_node_notlinux.go @@ -10,6 +10,6 @@ import ( ) // permits compilation in non-linux environments -func linuxLocalFetcher(_ context.Context) (NodeStore, error) { - return NodeStore{}, nil +func linuxLocalFetcher(_ context.Context) (NodeMeta, error) { + return NodeMeta{}, nil } diff --git a/pkg/appolly/meta/meta_node_otel_detector.go b/pkg/appolly/meta/meta_node_otel_detector.go index 029c1a3491..81a485d798 100644 --- a/pkg/appolly/meta/meta_node_otel_detector.go +++ b/pkg/appolly/meta/meta_node_otel_detector.go @@ -15,21 +15,21 @@ import ( ) func otelNodeFetcher(detector resource.Detector) fetcher { - log := slog.With("component", "meta.NodeStore.otelNodeFetcher", + log := slog.With("component", "meta.NodeMeta.otelNodeFetcher", "detector", fmt.Sprintf("%T", detector)[1:]) - return func(ctx context.Context) (NodeStore, error) { + return func(ctx context.Context) (NodeMeta, error) { resource, err := detector.Detect(ctx) // none of the errors from the ec2 detect are retriable, so we just log them. 
if err != nil { log.Debug("failed to detect AWS EC2 metadata", "error", err) } if resource == nil { - return NodeStore{}, nil + return NodeMeta{}, nil } // In any case, the API can return an error with a valid (partial resource) attrs := resource.Iter() - store := NodeStore{Metadata: make([]Entry, 0, attrs.Len())} + store := NodeMeta{Metadata: make([]Entry, 0, attrs.Len())} for attrs.Next() { at := attrs.Attribute() if at.Key == semconv.HostIDKey { diff --git a/pkg/appolly/meta/meta_node_test.go b/pkg/appolly/meta/meta_node_test.go index 57557672d4..7d3649d663 100644 --- a/pkg/appolly/meta/meta_node_test.go +++ b/pkg/appolly/meta/meta_node_test.go @@ -20,8 +20,8 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { synctest.Test(t, func(t *testing.T) { // Create fetchers that fail different numbers of times before succeeding failOnce := makeFetcherThatFailsNTimes(1, "fetcher1", "value1") - alwaysFails := func(_ context.Context) (NodeStore, error) { - return NodeStore{}, errors.New("permanent failure") + alwaysFails := func(_ context.Context) (NodeMeta, error) { + return NodeMeta{}, errors.New("permanent failure") } failTwice := makeFetcherThatFailsNTimes(2, "fetcher2", "value2") succeedImmediately := makeFetcherThatFailsNTimes(0, "fetcher3", "value3") @@ -29,7 +29,7 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { entries := fetchEntries(t.Context(), failOnce, alwaysFails, failTwice, succeedImmediately) // All fetchers should eventually succeed and return their data - require.Equal(t, NodeStore{ + require.Equal(t, NodeMeta{ HostID: "host_fetcher3", Metadata: []Entry{ {Key: "fetcher1_1", Value: "value1_1"}, @@ -47,35 +47,37 @@ func TestFetchEntries_RetryAndKeepOrder(t *testing.T) { func TestFetchEntries_DeduplicateByPriority(t *testing.T) { entries := fetchEntries(t.Context(), // lowest-priority fetcher - func(_ context.Context) (NodeStore, error) { - return NodeStore{ + func(_ context.Context) (NodeMeta, error) { + return NodeMeta{ HostID: 
"should-be-overridden", Metadata: []Entry{ + {Key: "host.name", Value: "will-be-filtered"}, {Key: "some.local.stuff", Value: "something"}, {Key: "cloud.stuff", Value: "should-be-overridden"}, - {Key: "host.name", Value: "foo-hostname"}, + {Key: "host.stuff", Value: "foo-stuffname"}, }, }, nil }, // highest-priority fetcher - func(_ context.Context) (NodeStore, error) { - return NodeStore{ + func(_ context.Context) (NodeMeta, error) { + return NodeMeta{ HostID: "vm-01234567", Metadata: []Entry{ {Key: "foo", Value: "bar"}, {Key: "cloud.stuff", Value: "the-cloud-stuff"}, + {Key: "host.name", Value: "will-be-filtered"}, {Key: "baz", Value: "bae"}, }, }, nil }, ) - assert.Equal(t, NodeStore{ + assert.Equal(t, NodeMeta{ HostID: "vm-01234567", Metadata: []Entry{ {Key: "baz", Value: "bae"}, {Key: "cloud.stuff", Value: "the-cloud-stuff"}, {Key: "foo", Value: "bar"}, - {Key: "host.name", Value: "foo-hostname"}, + {Key: "host.stuff", Value: "foo-stuffname"}, {Key: "some.local.stuff", Value: "something"}, }, }, entries) @@ -83,12 +85,12 @@ func TestFetchEntries_DeduplicateByPriority(t *testing.T) { func makeFetcherThatFailsNTimes(failCount int, key, value string) fetcher { attempts := atomic.Int32{} - return func(_ context.Context) (NodeStore, error) { + return func(_ context.Context) (NodeMeta, error) { attempt := attempts.Add(1) if attempt <= int32(failCount) { - return NodeStore{}, errors.New("simulated failure") + return NodeMeta{}, errors.New("simulated failure") } - return NodeStore{ + return NodeMeta{ HostID: "host_" + key, Metadata: []Entry{ {Key: attr.Name(key + "_1"), Value: value + "_1"}, diff --git a/pkg/export/otel/metrics.go b/pkg/export/otel/metrics.go index 7b6eefd789..9da3c435eb 100644 --- a/pkg/export/otel/metrics.go +++ b/pkg/export/otel/metrics.go @@ -74,7 +74,7 @@ type MetricsReporter struct { ctx context.Context cfg *otelcfg.MetricsConfig jointMetricsCfg *perapp.MetricsConfig - nodeMeta meta.NodeStore + nodeMeta meta.NodeMeta attributes 
*attributes.AttrSelector exporter sdkmetric.Exporter reporters otelcfg.ReporterPool[*svc.Attrs, *Metrics] @@ -653,7 +653,7 @@ func (mr *MetricsReporter) newMetricsInstance(service *svc.Attrs) Metrics { var resourceAttributes []attribute.KeyValue if service != nil { mlog = mlog.With("service", service) - resourceAttributes = append(otelcfg.GetAppResourceAttrs(mr.nodeMeta.HostID, service), otelcfg.ResourceAttrsFromEnv(service)...) + resourceAttributes = append(otelcfg.GetAppResourceAttrs(&mr.nodeMeta, service), otelcfg.ResourceAttrsFromEnv(service)...) } mlog.Debug("creating new Metrics reporter") resources := resource.NewWithAttributes(semconv.SchemaURL, resourceAttributes...) @@ -786,14 +786,16 @@ func (mr *MetricsReporter) tracesResourceAttributes(service *svc.Attrs) attribut semconv.OSTypeKey.String("linux"), } - extraAttrs := []attribute.KeyValue{ - semconv.HostID(mr.nodeMeta.HostID), - } - + extraAttrs := make([]attribute.KeyValue, 0, len(service.Metadata)+len(mr.nodeMeta.Metadata)+1) + extraAttrs = append(extraAttrs, semconv.HostID(mr.nodeMeta.HostID)) for k, v := range service.Metadata { extraAttrs = append(extraAttrs, k.OTEL().String(v)) } + for _, entry := range mr.nodeMeta.Metadata { + extraAttrs = append(extraAttrs, entry.Key.OTEL().String(entry.Value)) + } + filteredAttrs := otelcfg.GetFilteredAttributesByPrefix(baseAttrs, mr.userAttribSelection, extraAttrs, MetricTypes) return attribute.NewSet(filteredAttrs...) } @@ -1041,7 +1043,7 @@ func (mr *MetricsReporter) resourceAttrsForService(service *svc.Attrs) []attribu attribute.String(string(attr.Job), service.Job()), } - attrs = append(attrs, otelcfg.GetAppResourceAttrs(mr.nodeMeta.HostID, service)...) + attrs = append(attrs, otelcfg.GetAppResourceAttrs(&mr.nodeMeta, service)...) return append(attrs, otelcfg.ResourceAttrsFromEnv(service)...) 
} diff --git a/pkg/export/otel/metrics_internal.go b/pkg/export/otel/metrics_internal.go index e4d7fb3348..17d61c3dac 100644 --- a/pkg/export/otel/metrics_internal.go +++ b/pkg/export/otel/metrics_internal.go @@ -17,6 +17,7 @@ import ( "go.opentelemetry.io/otel/sdk/resource" semconv "go.opentelemetry.io/otel/semconv/v1.38.0" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/buildinfo" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/obi/pkg/export/imetrics" @@ -56,7 +57,7 @@ func NewInternalMetricsReporter(ctx context.Context, ctxInfo *global.ContextInfo return nil, err } - res := newResourceInternal(ctxInfo.NodeMeta.HostID) + res := newResourceInternal(&ctxInfo.NodeMeta) provider := newInternalMeterProvider(res, &exporter, metrics.Interval) meter := provider.Meter("obi_internal") tracerFlushes, err := meter.Float64Histogram( @@ -239,13 +240,17 @@ func (p *InternalMetricsReporter) InstrumentationError(processName, errorType st )) } -func newResourceInternal(hostID string) *resource.Resource { +func newResourceInternal(nodeMeta *meta.NodeMeta) *resource.Resource { attrs := []attribute.KeyValue{ semconv.ServiceName(attr.VendorSDKName), semconv.ServiceInstanceID(uuid.New().String()), semconv.TelemetrySDKLanguageKey.String(semconv.TelemetrySDKLanguageGo.Value.AsString()), semconv.TelemetrySDKNameKey.String(attr.VendorSDKName), - semconv.HostID(hostID), + semconv.HostID(nodeMeta.HostID), + } + + for _, event := range nodeMeta.Metadata { + attrs = append(attrs, event.Key.OTEL().String(event.Value)) } return resource.NewWithAttributes(semconv.SchemaURL, attrs...) 
diff --git a/pkg/export/otel/metrics_svc_graph.go b/pkg/export/otel/metrics_svc_graph.go index eeb127d3df..953139ab61 100644 --- a/pkg/export/otel/metrics_svc_graph.go +++ b/pkg/export/otel/metrics_svc_graph.go @@ -8,6 +8,7 @@ import ( "fmt" "log/slog" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/otel/attribute" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" @@ -46,7 +47,7 @@ const ( type SvcGraphMetricsReporter struct { ctx context.Context cfg *otelcfg.MetricsConfig - hostID string + nodeMeta meta.NodeMeta exporter sdkmetric.Exporter reporters otelcfg.ReporterPool[*svc.Attrs, *SvcGraphMetrics] pidTracker PidServiceTracker @@ -120,7 +121,7 @@ func newSvcGraphMetricsReporter( ctx: ctx, cfg: cfg, is: is, - hostID: ctxInfo.NodeMeta.HostID, + nodeMeta: ctxInfo.NodeMeta, input: input.Subscribe(msg.SubscriberName("otel.SvcGraphMetricsReporter.input")), processEvents: processEventCh.Subscribe(msg.SubscriberName("otel.SvcGraphMetricsReporter.processEvents")), metricAttributes: serviceGraphGetters(unresolved, ctxInfo.K8sInformer.IsKubeEnabled()), @@ -200,7 +201,7 @@ func (mr *SvcGraphMetricsReporter) newSvcGraphMetricsInstance(service *svc.Attrs var resourceAttributes []attribute.KeyValue if service != nil { log = log.With("service", service) - resourceAttributes = append(otelcfg.GetAppResourceAttrs(mr.hostID, service), otelcfg.ResourceAttrsFromEnv(service)...) + resourceAttributes = append(otelcfg.GetAppResourceAttrs(&mr.nodeMeta, service), otelcfg.ResourceAttrsFromEnv(service)...) } log.Debug("creating new Metrics reporter") resources := resource.NewWithAttributes(semconv.SchemaURL, resourceAttributes...) 
@@ -259,7 +260,7 @@ func (mr *SvcGraphMetricsReporter) tracesResourceAttributes(service *svc.Attrs) } extraAttrs := []attribute.KeyValue{ - semconv.HostID(mr.hostID), + semconv.HostID(mr.nodeMeta.HostID), } for k, v := range service.Metadata { diff --git a/pkg/export/otel/metrics_test.go b/pkg/export/otel/metrics_test.go index 5ed9dea573..4808aeee7a 100644 --- a/pkg/export/otel/metrics_test.go +++ b/pkg/export/otel/metrics_test.go @@ -809,7 +809,7 @@ func TestMetricResourceAttributes(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { mr := &MetricsReporter{ - nodeMeta: meta.NodeStore{HostID: "test-host-id"}, + nodeMeta: meta.NodeMeta{HostID: "test-host-id"}, userAttribSelection: tc.attributeSelect, } diff --git a/pkg/export/otel/otelcfg/common.go b/pkg/export/otel/otelcfg/common.go index 9e19dea986..6aebc4c1e7 100644 --- a/pkg/export/otel/otelcfg/common.go +++ b/pkg/export/otel/otelcfg/common.go @@ -28,6 +28,7 @@ import ( semconv "go.opentelemetry.io/otel/semconv/v1.38.0" "go.opentelemetry.io/obi/pkg/appolly/app/svc" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/buildinfo" "go.opentelemetry.io/obi/pkg/config" "go.opentelemetry.io/obi/pkg/export/attributes" @@ -93,13 +94,13 @@ func omitFieldsForYAML(input any, omitFields map[string]struct{}) map[string]any return result } -func GetAppResourceAttrs(hostID string, service *svc.Attrs) []attribute.KeyValue { - return append(GetResourceAttrs(hostID, service), +func GetAppResourceAttrs(nodeMeta *meta.NodeMeta, service *svc.Attrs) []attribute.KeyValue { + return append(GetResourceAttrs(nodeMeta, service), semconv.ServiceInstanceID(service.UID.Instance), ) } -func GetResourceAttrs(hostID string, service *svc.Attrs) []attribute.KeyValue { +func GetResourceAttrs(nodeMeta *meta.NodeMeta, service *svc.Attrs) []attribute.KeyValue { attrs := []attribute.KeyValue{ semconv.ServiceName(service.UID.Name), // SpanMetrics requires an extra attribute besides service name @@ 
-111,7 +112,7 @@ func GetResourceAttrs(hostID string, service *svc.Attrs) []attribute.KeyValue { semconv.TelemetrySDKNameKey.String(attr.VendorSDKName), semconv.TelemetrySDKVersion(buildinfo.Version), semconv.HostName(service.HostName), - semconv.HostID(hostID), + semconv.HostID(nodeMeta.HostID), semconv.OSTypeLinux, } @@ -122,6 +123,10 @@ func GetResourceAttrs(hostID string, service *svc.Attrs) []attribute.KeyValue { for k, v := range service.Metadata { attrs = append(attrs, k.OTEL().String(v)) } + + for _, entry := range nodeMeta.Metadata { + attrs = append(attrs, entry.Key.OTEL().String(entry.Value)) + } return attrs } diff --git a/pkg/export/otel/traces.go b/pkg/export/otel/traces.go index b8d62d069b..1a76b28404 100644 --- a/pkg/export/otel/traces.go +++ b/pkg/export/otel/traces.go @@ -125,7 +125,7 @@ func (tr *tracesOTELReceiver) processSpans(ctx context.Context, exp exporter.Tra if tr.spanMetricsEnabled { envResourceAttrs = append(envResourceAttrs, attribute.Bool(string(attr.SkipSpanMetrics.OTEL()), true)) } - traces := tracesgen.GenerateTracesWithAttributes(tr.attributeCache, &sample.Span.Service, envResourceAttrs, tr.ctxInfo.NodeMeta.HostID, spanGroup, reporterName, tr.ctxInfo.ExtraResourceAttributes...) + traces := tracesgen.GenerateTracesWithAttributes(tr.attributeCache, &sample.Span.Service, envResourceAttrs, &tr.ctxInfo.NodeMeta, spanGroup, reporterName, tr.ctxInfo.ExtraResourceAttributes...) 
err := exp.ConsumeTraces(ctx, traces) if err != nil { // We can't do if errors.Is(err, queue.ErrQueueIsFull), since the queue package is internal diff --git a/pkg/export/otel/traces_test.go b/pkg/export/otel/traces_test.go index 6b0f7aeb42..71dcb850dc 100644 --- a/pkg/export/otel/traces_test.go +++ b/pkg/export/otel/traces_test.go @@ -27,6 +27,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/export/attributes" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/obi/pkg/export/instrumentations" @@ -40,6 +41,8 @@ import ( var cache = expirable2.NewLRU[svc.UID, []attribute.KeyValue](1024, nil, 5*time.Minute) +var hostID = &meta.NodeMeta{HostID: "host-id"} + func BenchmarkGenerateTraces(b *testing.B) { start := time.Now() @@ -66,7 +69,7 @@ func BenchmarkGenerateTraces(b *testing.B) { group := groupFromSpanAndAttributes(span, attrs) for b.Loop() { - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, attrs, "host-id", group, reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, attrs, hostID, group, reporterName) if traces.ResourceSpans().Len() == 0 { b.Fatal("Generated traces is empty") @@ -100,7 +103,7 @@ func TestGenerateTraces(t *testing.T) { Service: svc.Attrs{UID: svc.UID{Name: "1"}}, } - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -145,7 +148,7 @@ func TestGenerateTraces(t *testing.T) { SpanID: spanID, TraceID: traceID, } - 
traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -181,7 +184,7 @@ func TestGenerateTraces(t *testing.T) { Route: "/test", Status: 200, } - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -218,7 +221,7 @@ func TestGenerateTraces(t *testing.T) { SpanID: spanID, TraceID: traceID, } - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -244,7 +247,7 @@ func TestGenerateTraces(t *testing.T) { TraceID: traceID, } - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, 
groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -264,7 +267,7 @@ func TestGenerateTraces(t *testing.T) { Method: "GET", Route: "/test", } - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(span, []attribute.KeyValue{}), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -280,7 +283,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test SQL trace generation, no statement", func(t *testing.T) { span := makeSQLRequestSpan("SELECT password FROM credentials WHERE username=\"bill\"") tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -302,7 +305,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test SQL trace generation, unknown attribute", func(t *testing.T) { span := makeSQLRequestSpan("SELECT password, name FROM credentials WHERE username=\"bill\"") tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{"db.operation.name": {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), 
reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -324,7 +327,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test SQL trace generation, unknown attribute", func(t *testing.T) { span := makeSQLRequestSpan("SELECT password FROM credentials WHERE username=\"bill\"") tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{attr.DBQueryText: {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -346,7 +349,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test SQL trace generation, error", func(t *testing.T) { span := makeSQLRequestErroredSpan("SELECT * FROM obi.nonexisting") tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{attr.DBQueryText: {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -374,7 +377,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test Kafka trace generation", func(t *testing.T) { span := request.Span{Type: request.EventTypeKafkaClient, Method: 
"process", Path: "important-topic", Statement: "test"} tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -393,7 +396,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test MQTT trace generation", func(t *testing.T) { span := request.Span{Type: request.EventTypeMQTTClient, Method: "publish", Path: "sensors/temperature", Statement: "mqtt-client-1"} tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -412,7 +415,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test Mongo trace generation", func(t *testing.T) { span := request.Span{Type: request.EventTypeMongoClient, Method: "insert", Path: "mycollection", DBNamespace: "mydatabase", Status: 0} tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{"db.operation.name": {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, 
groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -435,7 +438,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test Mongo trace generation with error", func(t *testing.T) { span := request.Span{Type: request.EventTypeMongoClient, Method: "insert", Path: "mycollection", DBNamespace: "mydatabase", Status: 1, DBError: request.DBError{ErrorCode: "1", Description: "Internal MongoDB error"}} tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{"db.operation.name": {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -460,7 +463,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test Couchbase trace generation", func(t *testing.T) { span := request.Span{Type: request.EventTypeCouchbaseClient, Method: "GET", Path: "mycollection", DBNamespace: "mybucket.myscope", Status: 0} tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{"db.operation.name": {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -483,7 +486,7 @@ func TestGenerateTracesAttributes(t *testing.T) { t.Run("test Couchbase trace generation 
with error", func(t *testing.T) { span := request.Span{Type: request.EventTypeCouchbaseClient, Method: "GET", Path: "mycollection", DBNamespace: "mybucket.myscope", Status: 1, DBError: request.DBError{ErrorCode: "1", Description: "KEY_NOT_FOUND"}} tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{"db.operation.name": {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -519,7 +522,7 @@ func TestGenerateTracesAttributes(t *testing.T) { Status: 200, } tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{attr.DBQueryText: {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -558,7 +561,7 @@ func TestGenerateTracesAttributes(t *testing.T) { } // Without db.query.text in optional attributes tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) spans := 
traces.ResourceSpans().At(0).ScopeSpans().At(0).Spans() @@ -590,7 +593,7 @@ func TestGenerateTracesAttributes(t *testing.T) { }, } tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{attr.DBQueryText: {}}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) assert.Equal(t, 1, traces.ResourceSpans().At(0).ScopeSpans().Len()) @@ -625,7 +628,7 @@ func TestGenerateTracesAttributes(t *testing.T) { Status: 200, } tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) spans := traces.ResourceSpans().At(0).ScopeSpans().At(0).Spans() @@ -649,7 +652,7 @@ func TestGenerateTracesAttributes(t *testing.T) { span := request.Span{Type: request.EventTypeHTTP, Method: "GET", Route: "/test", Status: 200} tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{}) - traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, otelcfg.ResourceAttrsFromEnv(&span.Service), "host-id", groupFromSpanAndAttributes(&span, tAttrs), reporterName) + traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, otelcfg.ResourceAttrsFromEnv(&span.Service), hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName) assert.Equal(t, 1, traces.ResourceSpans().Len()) rs := traces.ResourceSpans().At(0) @@ -662,7 +665,7 @@ func 
TestGenerateTracesAttributes(t *testing.T) { tAttrs := tracesgen.TraceAttributesSelector(&span, map[attr.Name]struct{}{}) traces := tracesgen.GenerateTracesWithAttributes(cache, &span.Service, - otelcfg.ResourceAttrsFromEnv(&span.Service), "host-id", + otelcfg.ResourceAttrsFromEnv(&span.Service), hostID, groupFromSpanAndAttributes(&span, tAttrs), reporterName, attribute.String("deployment.environment", "productions"), @@ -1069,10 +1072,11 @@ func TestTracesAttrReuse(t *testing.T) { }, } + host123 := &meta.NodeMeta{HostID: "123"} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - attr1 := tracesgen.TraceAppResourceAttrs(cache, "123", &tt.span.Service) - attr2 := tracesgen.TraceAppResourceAttrs(cache, "123", &tt.span.Service) + attr1 := tracesgen.TraceAppResourceAttrs(cache, host123, &tt.span.Service) + attr2 := tracesgen.TraceAppResourceAttrs(cache, host123, &tt.span.Service) assert.Equal(t, tt.same, &attr1[0] == &attr2[0], tt.name) }) } @@ -1602,7 +1606,7 @@ func generateTracesForSpans(t *testing.T, tr *tracesOTELReceiver, spans []reques } tAttrs := tracesgen.TraceAttributesSelector(span, traceAttrs) - res = append(res, tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, "host-id", groupFromSpanAndAttributes(span, tAttrs), reporterName)) + res = append(res, tracesgen.GenerateTracesWithAttributes(cache, &span.Service, []attribute.KeyValue{}, hostID, groupFromSpanAndAttributes(span, tAttrs), reporterName)) } return res diff --git a/pkg/export/otel/tracesgen/tracesgen.go b/pkg/export/otel/tracesgen/tracesgen.go index 276bc98881..ca2cb8bb3c 100644 --- a/pkg/export/otel/tracesgen/tracesgen.go +++ b/pkg/export/otel/tracesgen/tracesgen.go @@ -24,6 +24,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/ebpf/common/dnsparser" "go.opentelemetry.io/obi/pkg/export/attributes" attr 
"go.opentelemetry.io/obi/pkg/export/attributes/names" @@ -112,14 +113,14 @@ func GenerateTracesWithAttributes( cache *expirable2.LRU[svc.UID, []attribute.KeyValue], svc *svc.Attrs, envResourceAttrs []attribute.KeyValue, - hostID string, + nodeMeta *meta.NodeMeta, spans []TraceSpanAndAttributes, reporterName string, extraResAttrs ...attribute.KeyValue, ) ptrace.Traces { traces := ptrace.NewTraces() rs := traces.ResourceSpans().AppendEmpty() - resourceAttrs := TraceAppResourceAttrs(cache, hostID, svc) + resourceAttrs := TraceAppResourceAttrs(cache, nodeMeta, svc) resourceAttrs = append(resourceAttrs, envResourceAttrs...) resourceAttrsMap := AttrsToMap(resourceAttrs) resourceAttrsMap.PutStr(string(semconv.OTelScopeNameKey), reporterName) @@ -211,17 +212,17 @@ func createSubSpans(span *request.Span, parentSpanID pcommon.SpanID, traceID pco var emptyUID = svc.UID{} -func TraceAppResourceAttrs(cache *expirable2.LRU[svc.UID, []attribute.KeyValue], hostID string, service *svc.Attrs) []attribute.KeyValue { +func TraceAppResourceAttrs(cache *expirable2.LRU[svc.UID, []attribute.KeyValue], nodeMeta *meta.NodeMeta, service *svc.Attrs) []attribute.KeyValue { // TODO: remove? 
if service.UID == emptyUID { - return otelcfg.GetAppResourceAttrs(hostID, service) + return otelcfg.GetAppResourceAttrs(nodeMeta, service) } attrs, ok := cache.Get(service.UID) if ok { return attrs } - attrs = otelcfg.GetAppResourceAttrs(hostID, service) + attrs = otelcfg.GetAppResourceAttrs(nodeMeta, service) cache.Add(service.UID, attrs) return attrs diff --git a/pkg/export/prom/prom.go b/pkg/export/prom/prom.go index 56349730a1..a2c6a47587 100644 --- a/pkg/export/prom/prom.go +++ b/pkg/export/prom/prom.go @@ -18,6 +18,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" "go.opentelemetry.io/obi/pkg/appolly/discover/exec" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/buildinfo" "go.opentelemetry.io/obi/pkg/export" "go.opentelemetry.io/obi/pkg/export/attributes" @@ -232,7 +233,7 @@ type metricsReporter struct { kubeEnabled bool dockerEnabled bool - hostID string + nodeMeta meta.NodeMeta serviceMap map[svc.UID]svc.Attrs pidsTracker otel.PidServiceTracker @@ -401,7 +402,7 @@ func newReporter( dockerEnabled: dockerEnabled, extraMetadataLabels: extraMetadataLabels, extraSpanMetadataLabels: extraSpanMetadataLabels, - hostID: ctxInfo.NodeMeta.HostID, + nodeMeta: ctxInfo.NodeMeta, clock: clock, is: is, promConnect: ctxInfo.Prometheus, @@ -579,7 +580,7 @@ func newReporter( return prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: TracesTargetInfo, Help: "target service information in trace span metric format", - }, labelNamesTargetInfo(kubeEnabled, dockerEnabled, extraMetadataLabels)) + }, labelNamesTargetInfo(kubeEnabled, dockerEnabled, &ctxInfo.NodeMeta, extraMetadataLabels)) }), tracesHostInfo: optionalGaugeProvider(jointMetricsConfig.Features.AppHost(), func() *Expirer[prometheus.Gauge] { return NewExpirer[prometheus.Gauge](prometheus.NewGaugeVec(prometheus.GaugeOpts{ @@ -622,7 +623,7 @@ func newReporter( targetInfo: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: TargetInfo, Help: 
"attributes associated to a given monitored entity", - }, labelNamesTargetInfo(kubeEnabled, dockerEnabled, extraMetadataLabels)), + }, labelNamesTargetInfo(kubeEnabled, dockerEnabled, &ctxInfo.NodeMeta, extraMetadataLabels)), cudaKernelCallsTotal: optionalCounterProvider(is.GPUEnabled(), func() *Expirer[prometheus.Counter] { return NewExpirer[prometheus.Counter](prometheus.NewCounterVec(prometheus.CounterOpts{ Name: attributes.GPUCudaKernelLaunchCalls.Prom, @@ -863,7 +864,7 @@ func (r *metricsReporter) observe(span *request.Span) { t := span.Timings() r.beylaInfo.WithLabelValues(span.Service.SDKLanguage.String()).Metric.Set(1.0) if span.Service.Features.AppHost() { - r.tracesHostInfo.WithLabelValues(r.hostID).Metric.Set(1.0) + r.tracesHostInfo.WithLabelValues(r.nodeMeta.HostID).Metric.Set(1.0) } duration := t.End.Sub(t.RequestStart).Seconds() @@ -1097,7 +1098,7 @@ func (r *metricsReporter) labelValuesSpans(span *request.Span) []string { return values } -func labelNamesTargetInfo(kubeEnabled, dockerEnabled bool, extraMetadataLabelNames []attr.Name) []string { +func labelNamesTargetInfo(kubeEnabled, dockerEnabled bool, nodeMeta *meta.NodeMeta, extraMetadataLabelNames []attr.Name) []string { names := []string{ hostIDKey, hostNameKey, @@ -1119,6 +1120,10 @@ func labelNamesTargetInfo(kubeEnabled, dockerEnabled bool, extraMetadataLabelNam names = appendDockerLabelNames(names) } + for _, entry := range nodeMeta.Metadata { + names = append(names, entry.Key.Prom()) + } + for _, mdn := range extraMetadataLabelNames { names = append(names, mdn.Prom()) } @@ -1128,7 +1133,7 @@ func labelNamesTargetInfo(kubeEnabled, dockerEnabled bool, extraMetadataLabelNam func (r *metricsReporter) labelValuesTargetInfo(service *svc.Attrs) []string { values := []string{ - r.hostID, + r.nodeMeta.HostID, service.HostName, service.UID.Name, service.UID.Namespace, @@ -1149,6 +1154,10 @@ func (r *metricsReporter) labelValuesTargetInfo(service *svc.Attrs) []string { values = 
appendDockerLabelValuesService(values, service) } + for _, entry := range r.nodeMeta.Metadata { + values = append(values, entry.Value) + } + for _, k := range r.extraMetadataLabels { values = append(values, service.Metadata[k]) } diff --git a/pkg/export/prom/prom_test.go b/pkg/export/prom/prom_test.go index 20058e0e4a..6814289eb0 100644 --- a/pkg/export/prom/prom_test.go +++ b/pkg/export/prom/prom_test.go @@ -59,7 +59,7 @@ func TestAppMetricsExpiration(t *testing.T) { exporter, err := PrometheusEndpoint( &global.ContextInfo{ Prometheus: &connector.PrometheusManager{}, - NodeMeta: meta.NodeStore{HostID: "my-host"}, + NodeMeta: meta.NodeMeta{HostID: "my-host"}, MetricAttributeGroups: g, }, &PrometheusConfig{ diff --git a/pkg/pipe/global/context.go b/pkg/pipe/global/context.go index 80d9168903..3e07eaa4ee 100644 --- a/pkg/pipe/global/context.go +++ b/pkg/pipe/global/context.go @@ -23,7 +23,7 @@ import ( type ContextInfo struct { // NodeMeta of the node (physical, VM, cloud instance...) running OBI. // Including the HostID and other host metadata Attributes - NodeMeta meta.NodeStore + NodeMeta meta.NodeMeta // AppO11y stores context information that is only required for application observability. 
// Its values must be initialized by the App O11y code and shouldn't be accessed from the From 45a0f520cfeb29d3303d0a54a5117f5e875571b7 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Fri, 13 Feb 2026 16:27:07 +0100 Subject: [PATCH 07/17] add timeout to OTEL detectors --- pkg/appolly/meta/meta_node.go | 6 ++++-- .../{meta_node_otel_detector.go => meta_node_cloud.go} | 3 +++ pkg/appolly/meta/meta_node_linux.go | 2 +- pkg/appolly/meta/meta_node_test.go | 7 +++++++ pkg/export/otel/metrics_svc_graph.go | 2 +- pkg/instrumenter/instrumenter.go | 2 +- 6 files changed, 17 insertions(+), 5 deletions(-) rename pkg/appolly/meta/{meta_node_otel_detector.go => meta_node_cloud.go} (89%) diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index de013ba0d8..3f87d9e3b4 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -30,6 +30,8 @@ const ( retryMaxInterval = 5 * time.Second ) +var connectionTimeout = 2 * time.Second + // some attributes from the node need to be filtered out, because they are // going to be specified for each service instance var filterAttrs []attr.Name = []attr.Name{ @@ -61,7 +63,7 @@ type Entry struct { Value string } -func NewNodeStore( +func NewNodeMeta( ctx context.Context, overrideHost string, kubeInformer *kube.MetadataProvider, @@ -122,7 +124,7 @@ func backoffFetch(ctx context.Context, fetch fetcher, log *slog.Logger) NodeMeta } // exponential backoff retry strategy if time.Since(start) > retryTimeout { - log.Warn("timeout reached while looking for metadata. Giving up", "error", err) + log.Debug("timeout reached while looking for metadata. Giving up", "error", err) return NodeMeta{} } log.Debug("can't fetch metadata. 
Will retry", "retryAfter", backoff, "error", err) diff --git a/pkg/appolly/meta/meta_node_otel_detector.go b/pkg/appolly/meta/meta_node_cloud.go similarity index 89% rename from pkg/appolly/meta/meta_node_otel_detector.go rename to pkg/appolly/meta/meta_node_cloud.go index 81a485d798..e687b9f4fa 100644 --- a/pkg/appolly/meta/meta_node_otel_detector.go +++ b/pkg/appolly/meta/meta_node_cloud.go @@ -19,6 +19,9 @@ func otelNodeFetcher(detector resource.Detector) fetcher { "detector", fmt.Sprintf("%T", detector)[1:]) return func(ctx context.Context) (NodeMeta, error) { + // we expect very short response time in a cloud environment + ctx, cancel := context.WithTimeout(ctx, connectionTimeout) + defer cancel() resource, err := detector.Detect(ctx) // none of the errors from the ec2 detect are retriable, so we just log them. if err != nil { diff --git a/pkg/appolly/meta/meta_node_linux.go b/pkg/appolly/meta/meta_node_linux.go index 4201001208..5adcd2b0d0 100644 --- a/pkg/appolly/meta/meta_node_linux.go +++ b/pkg/appolly/meta/meta_node_linux.go @@ -11,7 +11,7 @@ import ( "os" ) -func linuxLocalFetcher(ctx context.Context) (NodeMeta, error) { +func linuxLocalFetcher(_ context.Context) (NodeMeta, error) { mid, err := fetchMachineID() if err != nil { // If we can't read host ID, we don't retry as it is mostly diff --git a/pkg/appolly/meta/meta_node_test.go b/pkg/appolly/meta/meta_node_test.go index 7d3649d663..80109e7c38 100644 --- a/pkg/appolly/meta/meta_node_test.go +++ b/pkg/appolly/meta/meta_node_test.go @@ -9,6 +9,7 @@ import ( "sync/atomic" "testing" "testing/synctest" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -83,6 +84,12 @@ func TestFetchEntries_DeduplicateByPriority(t *testing.T) { }, entries) } +func TestHostIDOverride(t *testing.T) { + connectionTimeout = 50 * time.Millisecond + nm := NewNodeMeta(t.Context(), "host_override", nil) + assert.Equal(t, "host_override", nm.HostID) +} + func makeFetcherThatFailsNTimes(failCount 
int, key, value string) fetcher { attempts := atomic.Int32{} return func(_ context.Context) (NodeMeta, error) { diff --git a/pkg/export/otel/metrics_svc_graph.go b/pkg/export/otel/metrics_svc_graph.go index 953139ab61..2678b5231a 100644 --- a/pkg/export/otel/metrics_svc_graph.go +++ b/pkg/export/otel/metrics_svc_graph.go @@ -8,7 +8,6 @@ import ( "fmt" "log/slog" - "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/otel/attribute" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" @@ -19,6 +18,7 @@ import ( "go.opentelemetry.io/obi/pkg/appolly/app/request" "go.opentelemetry.io/obi/pkg/appolly/app/svc" "go.opentelemetry.io/obi/pkg/appolly/discover/exec" + "go.opentelemetry.io/obi/pkg/appolly/meta" "go.opentelemetry.io/obi/pkg/export/attributes" attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/obi/pkg/export/instrumentations" diff --git a/pkg/instrumenter/instrumenter.go b/pkg/instrumenter/instrumenter.go index 82d312ee56..7549f6d24c 100644 --- a/pkg/instrumenter/instrumenter.go +++ b/pkg/instrumenter/instrumenter.go @@ -190,7 +190,7 @@ func BuildCommonContextInfo( ServiceNameTemplate: templ, }, ctxInfo.Metrics) - ctxInfo.NodeMeta = meta.NewNodeStore( + ctxInfo.NodeMeta = meta.NewNodeMeta( ctx, config.Attributes.HostID.Override, ctxInfo.K8sInformer, From 2cb3522dbaf19c007391652ac856f69c8f31f360 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Tue, 17 Feb 2026 10:39:43 +0100 Subject: [PATCH 08/17] Fixed crash --- pkg/appolly/discover/elf_test.go | 5 +++++ pkg/appolly/meta/meta_node_cloud.go | 11 +++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pkg/appolly/discover/elf_test.go b/pkg/appolly/discover/elf_test.go index 636f637eb1..060f833f88 100644 --- a/pkg/appolly/discover/elf_test.go +++ b/pkg/appolly/discover/elf_test.go @@ -6,6 +6,7 @@ package discover import ( "os" "reflect" + "runtime" "testing" "go.opentelemetry.io/obi/pkg/appolly/app" @@ -14,6 +15,10 @@ 
import ( ) func TestFindINodeForPID(t *testing.T) { + if runtime.GOOS != "linux" { + t.Skip("skipping FindINodeForPID test on non-linux platform") + } + // Use our own PID — guaranteed to exist and have a valid /proc//exe self := app.PID(os.Getpid()) diff --git a/pkg/appolly/meta/meta_node_cloud.go b/pkg/appolly/meta/meta_node_cloud.go index e687b9f4fa..d24d42972b 100644 --- a/pkg/appolly/meta/meta_node_cloud.go +++ b/pkg/appolly/meta/meta_node_cloud.go @@ -8,10 +8,9 @@ import ( "fmt" "log/slog" + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/otel/sdk/resource" semconv "go.opentelemetry.io/otel/semconv/v1.38.0" - - attr "go.opentelemetry.io/obi/pkg/export/attributes/names" ) func otelNodeFetcher(detector resource.Detector) fetcher { @@ -35,9 +34,13 @@ func otelNodeFetcher(detector resource.Detector) fetcher { store := NodeMeta{Metadata: make([]Entry, 0, attrs.Len())} for attrs.Next() { at := attrs.Attribute() - if at.Key == semconv.HostIDKey { + switch at.Key { + case semconv.HostIDKey: store.HostID = at.Value.Emit() - } else { + case semconv.OSTypeKey: + // we ignore some values that are explicitly added in the + // exporters and would cause attributes duplication (panic) + default: store.Metadata = append(store.Metadata, Entry{Key: attr.Name(at.Key), Value: at.Value.Emit()}) } From 3b351e21e6afb33245ae176850e70e327eb42325 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Tue, 17 Feb 2026 13:58:16 +0100 Subject: [PATCH 09/17] fix format --- pkg/appolly/meta/meta_node_cloud.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/appolly/meta/meta_node_cloud.go b/pkg/appolly/meta/meta_node_cloud.go index d24d42972b..4e875b71dc 100644 --- a/pkg/appolly/meta/meta_node_cloud.go +++ b/pkg/appolly/meta/meta_node_cloud.go @@ -8,9 +8,10 @@ import ( "fmt" "log/slog" - attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/otel/sdk/resource" semconv "go.opentelemetry.io/otel/semconv/v1.38.0" + + 
attr "go.opentelemetry.io/obi/pkg/export/attributes/names" ) func otelNodeFetcher(detector resource.Detector) fetcher { From b184617055e959c5b6cd6738a29fd61aa83cfcbd Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Tue, 17 Feb 2026 16:05:05 +0100 Subject: [PATCH 10/17] AWS metrics metadata tests --- internal/test/integration/aws_test.go | 59 ++++++++++++------- .../integration/configs/obi-config-dual.yml | 25 ++++++++ .../configs/prometheus-config-perapp.yml | 1 + internal/test/integration/dockerutil_test.go | 14 ++++- 4 files changed, 76 insertions(+), 23 deletions(-) create mode 100644 internal/test/integration/configs/obi-config-dual.yml diff --git a/internal/test/integration/aws_test.go b/internal/test/integration/aws_test.go index c2d53ba5e8..97d020764b 100644 --- a/internal/test/integration/aws_test.go +++ b/internal/test/integration/aws_test.go @@ -51,11 +51,9 @@ func setupMockIMDS(t *testing.T, network *dockertest.Network) { } // This file contains tests related with the integration with Amazon Web Services -func TestClusterName(t *testing.T) { - clusterName := "test-eks-cluster" - +func TestCloudResourceMetadata(t *testing.T) { network := setupDockerNetwork(t) - setupContainerPrometheus(t, network, "prometheus-config.yml") + setupContainerPrometheus(t, network, "prometheus-config-perapp.yml") setupContainerJaeger(t, network) setupContainerCollector(t, network, "otelcol-config.yml") setupMockIMDS(t, network) @@ -70,6 +68,7 @@ func TestClusterName(t *testing.T) { // Configure OBI to use the mock IMDS by setting the EC2 metadata endpoint o := obi{ Env: []string{ + `OTEL_EBPF_PROMETHEUS_PORT=8999`, "OTEL_EBPF_OPEN_PORT=8080", // Configure AWS SDK to use custom endpoint for EC2 metadata // The official amazon-ec2-metadata-mock runs on port 1338 @@ -79,26 +78,44 @@ func TestClusterName(t *testing.T) { if !KernelLockdownMode() { o.SecurityConfigSuffix = "_none" } - o.instrument(t, network, testserver, "obi-config-aws.yml") + o.instrument(t, network, testserver, 
"obi-config.yml") - t.Run("Cluster name from EC2 metadata", func(t *testing.T) { - // Wait for test components to be ready - waitForTestComponents(t, "http://localhost:8080") + // Wait for test components to be ready + waitForTestComponents(t, "http://localhost:8080") - // Make some requests to generate metrics - for range 4 { - ti.DoHTTPGet(t, "http://localhost:8080/rolldice", 200) - } + // Make some requests to generate metrics + for range 4 { + ti.DoHTTPGet(t, "http://localhost:8080/rolldice", 200) + } - // Query Prometheus for target_info with cluster_name attribute - pq := promtest.Client{HostPort: prometheusHostPort} + // Query Prometheus for target_info with cluster_name attribute + pq := promtest.Client{HostPort: prometheusHostPort} - // Check that the cluster_name appears in the target_info metric - require.EventuallyWithT(t, func(ct *assert.CollectT) { - query := fmt.Sprintf(`target_info{k8s_cluster_name="%s"}`, clusterName) - results, err := pq.Query(query) - require.NoError(ct, err, "failed to query Prometheus") - assert.NotEmpty(ct, results, "target_info with k8s_cluster_name should exist") - }, testTimeout, 500*time.Millisecond) + t.Run("OTEL metrics exported", func(t *testing.T) { + testMetrics(t, pq, "rolldice", "otel") + }) + t.Run("Prometheus metrics exported", func(t *testing.T) { + testMetrics(t, pq, "rolldice", "prometheus") }) } + +func testMetrics(t *testing.T, pq promtest.Client, serviceName, exporter string) { + require.EventuallyWithT(t, func(ct *assert.CollectT) { + // attribute values taken from aws-metadata-mock.json + query := `target_info{` + + `service_name="` + serviceName + `",` + + `exported="` + exporter + `",` + + `cloud_account_id="0123456789",` + + `cloud_availability_zone="us-east-1f",` + + `cloud_platform="aws_ec2",` + + `cloud_provider="aws",` + + `cloud_region="us-east-1",` + + `host_id="i-1234567890abcdef0",` + + `host_image_id="ami-0b69ea66ff7391e80",` + + `host_type="m4.xlarge"` + + `}` + results, err := pq.Query(query) 
+ require.NoError(ct, err, "failed to query metrics") + assert.NotEmpty(ct, results, "target_info with cloud metadata should exist") + }, testTimeout, 500*time.Millisecond) +} diff --git a/internal/test/integration/configs/obi-config-dual.yml b/internal/test/integration/configs/obi-config-dual.yml new file mode 100644 index 0000000000..871a1adb26 --- /dev/null +++ b/internal/test/integration/configs/obi-config-dual.yml @@ -0,0 +1,25 @@ +# OBI exporting metrics both via OTLP and Prometheus +routes: + patterns: + - /basic/:rnd + unmatched: path + ignored_patterns: + - /metrics + ignore_mode: traces +prometheus_export: + port: 8999 + features: + - application +otel_metrics_export: + endpoint: http://otelcol:4318 +otel_traces_export: + endpoint: http://jaeger:4318 + otel_sdk_log_level: debug +attributes: + kubernetes: + cluster_name: obi-k8s-test-cluster + resource_labels: + deployment.environment: ["deployment.environment"] + select: + "*": + include: ["*"] diff --git a/internal/test/integration/configs/prometheus-config-perapp.yml b/internal/test/integration/configs/prometheus-config-perapp.yml index 1bc7b92f4b..4c96a28c91 100644 --- a/internal/test/integration/configs/prometheus-config-perapp.yml +++ b/internal/test/integration/configs/prometheus-config-perapp.yml @@ -21,4 +21,5 @@ scrape_configs: - labels: exported: "prometheus" targets: + - 'obi:8999' - '172.17.0.1:8999' diff --git a/internal/test/integration/dockerutil_test.go b/internal/test/integration/dockerutil_test.go index b3415f13ee..996316f3bf 100644 --- a/internal/test/integration/dockerutil_test.go +++ b/internal/test/integration/dockerutil_test.go @@ -162,7 +162,6 @@ func (o obi) instrument(t *testing.T, network *dockertest.Network, resource *doc obi, err := dockerPool.RunWithOptions(&dockertest.RunOptions{ Repository: "hatest-obi", Name: fmt.Sprintf("obi-otel-test-%d", time.Now().UnixNano()), - Networks: []*dockertest.Network{network}, Cmd: []string{ "--config=/configs/" + configFile, }, @@ -197,8 
+196,19 @@ func (o obi) instrument(t *testing.T, network *dockertest.Network, resource *doc hc.PidMode = "container:" + resource.Container.ID }) require.NoError(t, err, "could not start OBI container") + + err = dockerPool.Client.ConnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{ + Container: obi.Container.ID, + EndpointConfig: &docker.EndpointConfig{ + Aliases: []string{"obi"}, + }, + }) + require.NoError(t, err, "could not attach OBI to network") + t.Cleanup(func() { - require.NoError(t, dockerPool.Purge(obi), "could not remove OBI container") + if err := dockerPool.Purge(obi); err != nil { + t.Logf("could not remove OBI container: %v", err) + } }) t.Log("OBI container started") } From 65a008dc2791e6958bfa4a0c3ae612e0ba4831fd Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Tue, 17 Feb 2026 18:28:05 +0100 Subject: [PATCH 11/17] AWS traces metadata tests --- internal/test/integration/aws_test.go | 44 ++++++++++++++++++-- internal/test/integration/dockerutil_test.go | 2 +- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/internal/test/integration/aws_test.go b/internal/test/integration/aws_test.go index 97d020764b..c1e0d56750 100644 --- a/internal/test/integration/aws_test.go +++ b/internal/test/integration/aws_test.go @@ -4,7 +4,9 @@ package integration import ( + "encoding/json" "fmt" + "net/http" "testing" "time" @@ -13,6 +15,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/obi/internal/test/integration/components/jaeger" "go.opentelemetry.io/obi/internal/test/integration/components/promtest" ti "go.opentelemetry.io/obi/pkg/test/integration" ) @@ -47,7 +50,7 @@ func setupMockIMDS(t *testing.T, network *dockertest.Network) { }, }) require.NoError(t, err, "could not connect AWS EC2 Metadata Mock container to network") - t.Log("AWS EC2 Metadata Mock container started") + t.Log("AWS EC2 Metadata Mock container started", "state", mockIMDS.Container.State.Status) } // This 
file contains tests related with the integration with Amazon Web Services @@ -91,12 +94,15 @@ func TestCloudResourceMetadata(t *testing.T) { // Query Prometheus for target_info with cluster_name attribute pq := promtest.Client{HostPort: prometheusHostPort} - t.Run("OTEL metrics exported", func(t *testing.T) { + t.Run("OTEL metrics", func(t *testing.T) { testMetrics(t, pq, "rolldice", "otel") }) - t.Run("Prometheus metrics exported", func(t *testing.T) { + t.Run("Prometheus metrics", func(t *testing.T) { testMetrics(t, pq, "rolldice", "prometheus") }) + t.Run("OTEL traces", func(t *testing.T) { + testTraces(t) + }) } func testMetrics(t *testing.T, pq promtest.Client, serviceName, exporter string) { @@ -119,3 +125,35 @@ func testMetrics(t *testing.T, pq promtest.Client, serviceName, exporter string) assert.NotEmpty(ct, results, "target_info with cloud metadata should exist") }, testTimeout, 500*time.Millisecond) } + +func testTraces(t *testing.T) { + var trace jaeger.Trace + require.EventuallyWithT(t, func(ct *assert.CollectT) { + resp, err := http.Get(jaegerQueryURL + "?service=rolldice&operation=GET%20%2Frolldice") + require.NoError(ct, err) + if resp == nil { + return + } + require.Equal(ct, http.StatusOK, resp.StatusCode) + var tq jaeger.TracesQuery + require.NoError(ct, json.NewDecoder(resp.Body).Decode(&tq)) + traces := tq.FindBySpan(jaeger.Tag{Key: "url.path", Type: "string", Value: "/rolldice"}) + require.NotEmpty(ct, traces) + trace = traces[0] + require.Len(ct, trace.Spans, 3) // parent - in queue - processing + }, testTimeout, 100*time.Millisecond) + + for _, proc := range trace.Processes { + sd := jaeger.DiffAsRegexp([]jaeger.Tag{ + {Key: "cloud.account.id", Type: "string", Value: "^0123456789$"}, + {Key: "cloud.availability_zone", Type: "string", Value: "^us-east-1f$"}, + {Key: "cloud.platform", Type: "string", Value: "^aws_ec2$"}, + {Key: "cloud.provider", Type: "string", Value: "^aws$"}, + {Key: "cloud.region", Type: "string", Value: "^us-east-1$"}, + 
{Key: "host.id", Type: "string", Value: "^i-1234567890abcdef0$"}, + {Key: "host.image.id", Type: "string", Value: "^ami-0b69ea66ff7391e80$"}, + {Key: "host.type", Type: "string", Value: "^m4.xlarge$"}, + }, proc.Tags) + require.Empty(t, sd) + } +} diff --git a/internal/test/integration/dockerutil_test.go b/internal/test/integration/dockerutil_test.go index 996316f3bf..c0af8cb25b 100644 --- a/internal/test/integration/dockerutil_test.go +++ b/internal/test/integration/dockerutil_test.go @@ -37,7 +37,7 @@ func setupDockerNetwork(t *testing.T) *dockertest.Network { } // setupContainerPrometheus starts a Prometheus container for metrics scraping. -func setupContainerPrometheus(t *testing.T, network *dockertest.Network, configFile string) { //nolint:unparam // configFile is always passed in current usages but may vary in future +func setupContainerPrometheus(t *testing.T, network *dockertest.Network, configFile string) { t.Helper() t.Log("Starting Prometheus container...") From ea7514699be756f18db58220b5911f9f324d7f05 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 19 Feb 2026 09:17:17 +0100 Subject: [PATCH 12/17] fix some debug comments --- pkg/appolly/meta/meta_node_cloud.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/appolly/meta/meta_node_cloud.go b/pkg/appolly/meta/meta_node_cloud.go index 4e875b71dc..c1414a0514 100644 --- a/pkg/appolly/meta/meta_node_cloud.go +++ b/pkg/appolly/meta/meta_node_cloud.go @@ -23,9 +23,9 @@ func otelNodeFetcher(detector resource.Detector) fetcher { ctx, cancel := context.WithTimeout(ctx, connectionTimeout) defer cancel() resource, err := detector.Detect(ctx) - // none of the errors from the ec2 detect are retriable, so we just log them. + // none of the errors from the detector are retriable, so we just log them. 
if err != nil { - log.Debug("failed to detect AWS EC2 metadata", "error", err) + log.Debug("can't detect Cloud metadata", "error", err) } if resource == nil { return NodeMeta{}, nil @@ -40,7 +40,7 @@ func otelNodeFetcher(detector resource.Detector) fetcher { store.HostID = at.Value.Emit() case semconv.OSTypeKey: // we ignore some values that are explicitly added in the - // exporters and would cause attributes duplication (panic) + // exporters and would cause attribute duplication (panic) default: store.Metadata = append(store.Metadata, Entry{Key: attr.Name(at.Key), Value: at.Value.Emit()}) From a3c93c7c41a48072f42d07cc85efe8cc555933bf Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 19 Feb 2026 10:49:58 +0100 Subject: [PATCH 13/17] test: remove cloud fetchers --- pkg/appolly/meta/meta_node.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index 3f87d9e3b4..88b33ec058 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -11,10 +11,6 @@ import ( "sync" "time" - "go.opentelemetry.io/contrib/detectors/aws/ec2/v2" - "go.opentelemetry.io/contrib/detectors/azure/azurevm" - "go.opentelemetry.io/contrib/detectors/gcp" - attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/obi/pkg/kube" ) @@ -73,10 +69,10 @@ func NewNodeMeta( // will retrieve also host attributes that will be merged // in order of the priority below (the later the highest) linuxLocalFetcher, - kubeNodeFetcher(kubeInformer), - otelNodeFetcher(azurevm.New()), - otelNodeFetcher(gcp.NewDetector()), - otelNodeFetcher(ec2.NewResourceDetector()), + // kubeNodeFetcher(kubeInformer), + // otelNodeFetcher(azurevm.New()), + // otelNodeFetcher(gcp.NewDetector()), + // otelNodeFetcher(ec2.NewResourceDetector()), func(_ context.Context) (NodeMeta, error) { return NodeMeta{HostID: overrideHost}, nil }, From 4c560568240dbe8588be9df017b65760003ef115 Mon Sep 17 00:00:00 
2001 From: Mario Macias Date: Thu, 19 Feb 2026 11:07:17 +0100 Subject: [PATCH 14/17] experiment: unblock kube node fetcher --- pkg/appolly/meta/meta_node.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index 88b33ec058..81aa0339a4 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -69,7 +69,7 @@ func NewNodeMeta( // will retrieve also host attributes that will be merged // in order of the priority below (the later the highest) linuxLocalFetcher, - // kubeNodeFetcher(kubeInformer), + kubeNodeFetcher(kubeInformer), // otelNodeFetcher(azurevm.New()), // otelNodeFetcher(gcp.NewDetector()), // otelNodeFetcher(ec2.NewResourceDetector()), From 59fa9fd86074e592011cac5115507d57e5fa7ebf Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 19 Feb 2026 11:36:16 +0100 Subject: [PATCH 15/17] experiment: enable cloud meta but disable kube node fetcher --- pkg/appolly/meta/meta_node.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index 81aa0339a4..ad1b7925f8 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -11,6 +11,10 @@ import ( "sync" "time" + "go.opentelemetry.io/contrib/detectors/aws/ec2/v2" + "go.opentelemetry.io/contrib/detectors/azure/azurevm" + "go.opentelemetry.io/contrib/detectors/gcp" + attr "go.opentelemetry.io/obi/pkg/export/attributes/names" "go.opentelemetry.io/obi/pkg/kube" ) @@ -69,10 +73,10 @@ func NewNodeMeta( // will retrieve also host attributes that will be merged // in order of the priority below (the later the highest) linuxLocalFetcher, - kubeNodeFetcher(kubeInformer), - // otelNodeFetcher(azurevm.New()), - // otelNodeFetcher(gcp.NewDetector()), - // otelNodeFetcher(ec2.NewResourceDetector()), + // kubeNodeFetcher(kubeInformer), + otelNodeFetcher(azurevm.New()), + otelNodeFetcher(gcp.NewDetector()), + 
otelNodeFetcher(ec2.NewResourceDetector()), func(_ context.Context) (NodeMeta, error) { return NodeMeta{HostID: overrideHost}, nil }, From 8c6ddb004975416ebafccb95aa52dad3bb09a006 Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 19 Feb 2026 12:04:14 +0100 Subject: [PATCH 16/17] Avoid activating kubernetes in non-k8s tests --- .../configs/obi-config-discovery.yml | 1 - .../integration/configs/obi-config-dual.yml | 25 ---------------- .../configs/obi-config-error-test.yml | 3 -- .../configs/obi-config-multiexec-host.yml | 1 - .../configs/obi-config-multiexec-lang.yml | 1 - .../configs/obi-config-multiexec.yml | 1 - .../configs/obi-config-nodemultiproc.yml | 1 - .../configs/obi-config-sampler.yml | 1 - pkg/appolly/meta/meta_node.go | 2 +- pkg/appolly/meta/meta_node_cloud.go | 30 ++++++++++++++----- 10 files changed, 24 insertions(+), 42 deletions(-) delete mode 100644 internal/test/integration/configs/obi-config-dual.yml diff --git a/internal/test/integration/configs/obi-config-discovery.yml b/internal/test/integration/configs/obi-config-discovery.yml index c3eaaf4052..ea93d95793 100644 --- a/internal/test/integration/configs/obi-config-discovery.yml +++ b/internal/test/integration/configs/obi-config-discovery.yml @@ -53,7 +53,6 @@ discovery: exports: [metrics, traces] attributes: kubernetes: - enable: true cluster_name: my-kube select: http_server_request_duration_seconds_count: diff --git a/internal/test/integration/configs/obi-config-dual.yml b/internal/test/integration/configs/obi-config-dual.yml deleted file mode 100644 index 871a1adb26..0000000000 --- a/internal/test/integration/configs/obi-config-dual.yml +++ /dev/null @@ -1,25 +0,0 @@ -# OBI exporting metrics both via OTLP and Prometheus -routes: - patterns: - - /basic/:rnd - unmatched: path - ignored_patterns: - - /metrics - ignore_mode: traces -prometheus_export: - port: 8999 - features: - - application -otel_metrics_export: - endpoint: http://otelcol:4318 -otel_traces_export: - endpoint: 
http://jaeger:4318 - otel_sdk_log_level: debug -attributes: - kubernetes: - cluster_name: obi-k8s-test-cluster - resource_labels: - deployment.environment: ["deployment.environment"] - select: - "*": - include: ["*"] diff --git a/internal/test/integration/configs/obi-config-error-test.yml b/internal/test/integration/configs/obi-config-error-test.yml index ffc71b3b1b..56317ae8b6 100644 --- a/internal/test/integration/configs/obi-config-error-test.yml +++ b/internal/test/integration/configs/obi-config-error-test.yml @@ -13,9 +13,6 @@ discovery: name: testserver open_ports: 8080 exe_path: "testserver" -attributes: - kubernetes: - enable: true # Enable context propagation to trigger cgroup operations that may fail ebpf: context_propagation: all diff --git a/internal/test/integration/configs/obi-config-multiexec-host.yml b/internal/test/integration/configs/obi-config-multiexec-host.yml index ff9b6820c5..42039f736d 100644 --- a/internal/test/integration/configs/obi-config-multiexec-host.yml +++ b/internal/test/integration/configs/obi-config-multiexec-host.yml @@ -49,7 +49,6 @@ discovery: exports: [] # test exports field, do not export docker-proxy metrics or traces attributes: kubernetes: - enable: true cluster_name: my-kube select: http_server_request_duration_seconds_count: diff --git a/internal/test/integration/configs/obi-config-multiexec-lang.yml b/internal/test/integration/configs/obi-config-multiexec-lang.yml index 238fd735e1..cf2b0719fa 100644 --- a/internal/test/integration/configs/obi-config-multiexec-lang.yml +++ b/internal/test/integration/configs/obi-config-multiexec-lang.yml @@ -14,7 +14,6 @@ discovery: - exe_path: "{obi,prometheus,otelcol*,all*,launcher}" attributes: kubernetes: - enable: true cluster_name: my-kube select: http_server_request_duration_seconds_count: diff --git a/internal/test/integration/configs/obi-config-multiexec.yml b/internal/test/integration/configs/obi-config-multiexec.yml index 19c3197de7..2211442da2 100644 --- 
a/internal/test/integration/configs/obi-config-multiexec.yml +++ b/internal/test/integration/configs/obi-config-multiexec.yml @@ -48,7 +48,6 @@ discovery: open_ports: 8090 attributes: kubernetes: - enable: true cluster_name: my-kube select: http_server_request_duration_seconds_count: diff --git a/internal/test/integration/configs/obi-config-nodemultiproc.yml b/internal/test/integration/configs/obi-config-nodemultiproc.yml index 6aa9977548..cf387ae23c 100644 --- a/internal/test/integration/configs/obi-config-nodemultiproc.yml +++ b/internal/test/integration/configs/obi-config-nodemultiproc.yml @@ -22,7 +22,6 @@ discovery: open_ports: 5003 attributes: kubernetes: - enable: true cluster_name: my-kube select: http_server_request_duration_seconds_count: diff --git a/internal/test/integration/configs/obi-config-sampler.yml b/internal/test/integration/configs/obi-config-sampler.yml index d158dbc77a..c53d25803a 100644 --- a/internal/test/integration/configs/obi-config-sampler.yml +++ b/internal/test/integration/configs/obi-config-sampler.yml @@ -22,7 +22,6 @@ discovery: arg: "0.5" attributes: kubernetes: - enable: true cluster_name: my-kube select: http_server_request_duration_seconds_count: diff --git a/pkg/appolly/meta/meta_node.go b/pkg/appolly/meta/meta_node.go index ad1b7925f8..3f87d9e3b4 100644 --- a/pkg/appolly/meta/meta_node.go +++ b/pkg/appolly/meta/meta_node.go @@ -73,7 +73,7 @@ func NewNodeMeta( // will retrieve also host attributes that will be merged // in order of the priority below (the later the highest) linuxLocalFetcher, - // kubeNodeFetcher(kubeInformer), + kubeNodeFetcher(kubeInformer), otelNodeFetcher(azurevm.New()), otelNodeFetcher(gcp.NewDetector()), otelNodeFetcher(ec2.NewResourceDetector()), diff --git a/pkg/appolly/meta/meta_node_cloud.go b/pkg/appolly/meta/meta_node_cloud.go index c1414a0514..ba6707fff2 100644 --- a/pkg/appolly/meta/meta_node_cloud.go +++ b/pkg/appolly/meta/meta_node_cloud.go @@ -22,15 +22,31 @@ func otelNodeFetcher(detector 
resource.Detector) fetcher { // we expect very short response time in a cloud environment ctx, cancel := context.WithTimeout(ctx, connectionTimeout) defer cancel() - resource, err := detector.Detect(ctx) - // none of the errors from the detector are retriable, so we just log them. - if err != nil { - log.Debug("can't detect Cloud metadata", "error", err) - } - if resource == nil { + // running asynchronously to avoid that any connection issue blocks the main goroutine + resCh := make(chan *resource.Resource, 1) + go func() { + resource, err := detector.Detect(ctx) + // none of the errors from the detector are retriable, so we just log them. + if err != nil { + log.Debug("can't detect Cloud metadata", "error", err) + } + resCh <- resource + }() + + var resource *resource.Resource + select { + case resource = <-resCh: + if resource == nil { + // everything is fine, we might have asked for a Cloud resource from a baremetal machine + return NodeMeta{}, nil + } + case <-ctx.Done(): + log.Warn("timed out while waiting for Cloud metadata. 
Ignoring") return NodeMeta{}, nil } - // In any case, the API can return an error with a valid (partial resource) + + log.Info("detected Cloud metadata") + // In some cases, the API can return an error with a valid (partial) resource attrs := resource.Iter() store := NodeMeta{Metadata: make([]Entry, 0, attrs.Len())} for attrs.Next() { From 9e9fc7affa83cbae9897902ef77b43ee31aad51d Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 19 Feb 2026 12:15:55 +0100 Subject: [PATCH 17/17] add timeout to kube metadata retrieval --- pkg/appolly/meta/meta_node_kube.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/appolly/meta/meta_node_kube.go b/pkg/appolly/meta/meta_node_kube.go index 8fe87ba5fd..c6aced357e 100644 --- a/pkg/appolly/meta/meta_node_kube.go +++ b/pkg/appolly/meta/meta_node_kube.go @@ -6,17 +6,23 @@ package meta // import "go.opentelemetry.io/obi/pkg/appolly/meta" import ( "context" "fmt" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "go.opentelemetry.io/obi/pkg/kube" ) +const kubeTimeout = 30 * time.Second + func kubeNodeFetcher(k8sInformer *kube.MetadataProvider) fetcher { return func(ctx context.Context) (NodeMeta, error) { if !k8sInformer.IsKubeEnabled() { return NodeMeta{}, nil } + ctx, cancel := context.WithTimeout(ctx, kubeTimeout) + defer cancel() + nodeName, err := k8sInformer.CurrentNodeName(ctx) if err != nil { // forwarding an error will force the NodeMeta to