diff --git a/.chloggen/spanpruningprocessor-impl.yaml b/.chloggen/spanpruningprocessor-impl.yaml new file mode 100644 index 0000000000000..33185f61349fd --- /dev/null +++ b/.chloggen/spanpruningprocessor-impl.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog) +component: processor/spanpruning + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add full implementation of the span pruning processor. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [45654] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. 
+# Default: '[user]' +change_logs: [user] diff --git a/processor/spanpruningprocessor/README.md b/processor/spanpruningprocessor/README.md index 72c0cbcc5204d..5f7af4e1ca40f 100644 --- a/processor/spanpruningprocessor/README.md +++ b/processor/spanpruningprocessor/README.md @@ -1,12 +1,294 @@ - # Span Pruning Processor + + | Status | | | ------------- |-----------| -| Stability | [development]: traces | -| Distributions | [] | +| Stability | [alpha]: traces | +| Distributions | [contrib] | | Issues | [![Open issues](https://img.shields.io/github/issues-search/open-telemetry/opentelemetry-collector-contrib?query=is%3Aissue%20is%3Aopen%20label%3Aprocessor%2Fspanpruning%20&label=open&color=orange&logo=opentelemetry)](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues?q=is%3Aopen+is%3Aissue+label%3Aprocessor%2Fspanpruning) [![Closed issues](https://img.shields.io/github/issues-search/open-telemetry/opentelemetry-collector-contrib?query=is%3Aissue%20is%3Aclosed%20label%3Aprocessor%2Fspanpruning%20&label=closed&color=blue&logo=opentelemetry)](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues?q=is%3Aclosed+is%3Aissue+label%3Aprocessor%2Fspanpruning) | | Code coverage | [![codecov](https://codecov.io/github/open-telemetry/opentelemetry-collector-contrib/graph/main/badge.svg?component=processor_spanpruning)](https://app.codecov.io/gh/open-telemetry/opentelemetry-collector-contrib/tree/main/?components%5B0%5D=processor_spanpruning&displayType=list) | | [Code Owners](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/CONTRIBUTING.md#becoming-a-code-owner) | [@portertech](https://www.github.com/portertech) | -[development]: https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/component-stability.md#development +[alpha]: https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/component-stability.md#alpha +[contrib]: 
https://github.com/open-telemetry/opentelemetry-collector-releases/tree/main/distributions/otelcol-contrib + +## Overview + +The Span Pruning Processor identifies duplicate or similar leaf spans within a single trace, groups them, and replaces each group with a single aggregated summary span. When leaf spans are aggregated, the processor also recursively aggregates their parent spans if all children of those parents are being aggregated. + +**Leaf spans** are spans that are not referenced as a parent by any other span in the trace. They typically represent the last actions in an execution call stack (e.g., individual database queries, HTTP calls to external services). + +Spans are grouped by: +1. **Span name** - spans must have the same name +2. **Span kind** - spans must have the same kind (Internal, Server, Client, Producer, Consumer) +3. **Status code** - spans must have the same status (OK, Error, or Unset) +4. **TraceState** - spans must have identical TraceState values (for Consistent Probability Sampling compatibility) +5. **Configured attributes** - spans must have matching values for attributes specified in `group_by_attributes` +6. **Parent span name** - leaf spans must share the same parent span name to be grouped together + +Parent spans are eligible for aggregation when all of their children are aggregated, they share the same name, kind, and status code, and they are not root spans. + +This processor is useful for reducing trace data volume while preserving meaningful information about repeated operations. 
+ +## Use Cases + +- **Database query optimization**: When an application makes many similar database queries (e.g., N+1 queries), aggregate them into a single summary span +- **Batch operations**: Consolidate many similar leaf operations into a single representative span +- **Cost reduction**: Reduce trace storage costs by eliminating redundant span data + +## Configuration + +```yaml +processors: + spanpruning: + # Attributes to use for grouping similar leaf spans (supports glob patterns) + # Spans with the same name AND same values for matching attributes will be grouped + # Examples: + # - "db.*" matches db.operation, db.name, db.statement, etc. + # - "http.request.*" matches http.request.method, http.request.header, etc. + # - "db.operation" matches only the exact key "db.operation" + group_by_attributes: + - "db.*" + - "http.method" + + # Minimum number of similar leaf spans required before aggregation + # Default: 5 + min_spans_to_aggregate: 3 + + # Maximum depth of parent span aggregation above leaf spans + # 0 = only aggregate leaf spans (no parent aggregation) + # -1 = unlimited depth + # Default: 1 + max_parent_depth: 1 + + # Prefix for aggregation statistics attributes + # Default: "aggregation." + aggregation_attribute_prefix: "batch." +``` + +## Configuration Options + +| Field | Type | Default | Description | +|-----|--|---------|-------| +| `group_by_attributes` | []string | [] | Attribute patterns for grouping (supports glob patterns like `db.*`) | +| `min_spans_to_aggregate` | int | 5 | Minimum group size before aggregation occurs | +| `max_parent_depth` | int | 1 | Max depth of parent aggregation (0=none, -1=unlimited) | +| `aggregation_attribute_prefix` | string | "aggregation." | Prefix for aggregation statistics attributes | + +### Glob Pattern Support + +The `group_by_attributes` field supports glob patterns for matching attribute keys: + +| Pattern | Matches | +|-----|--| +| `db.*` | `db.operation`, `db.name`, `db.statement`, etc. 
| +| `http.request.*` | `http.request.method`, `http.request.header.content-type`, etc. | +| `rpc.*` | `rpc.method`, `rpc.service`, `rpc.system`, etc. | +| `db.operation` | Only the exact key `db.operation` | + +When multiple attributes match a pattern, they are all included in the grouping key (sorted alphabetically for consistency). + +## Summary Span + +When spans are aggregated, the summary span includes: + +### Properties +- **Name**: Original span name (e.g., `SELECT`) +- **TraceID**: Same as original spans +- **SpanID**: Newly generated unique ID +- **ParentSpanID**: Parent of the first span in the group, or the summary span that replaced that parent when the parent itself was aggregated (grouped spans share a parent span *name*, not necessarily the same parent span) +- **Kind**: Same as template span (inherited from slowest span) +- **StartTimestamp**: Earliest start time of all spans in the group +- **EndTimestamp**: Latest end time of all spans in the group +- **Status**: Same as original spans (spans are grouped by status code) +- **TraceState**: Inherited from the template span (preserved for Consistent Probability Sampling compatibility) +- **Attributes**: Inherited from the slowest span in the group + +> **Note**: The summary span's duration (`EndTimestamp - StartTimestamp`) represents the total time window covered by all aggregated spans, which may exceed `duration_max_ns`. For example, if spans overlap or are staggered, the time range can be larger than any individual span's duration. Use `duration_max_ns` to find the slowest individual operation. 
+### What Gets Aggregated Away + +The summary span is built from the template (slowest) span of each group. When spans are aggregated, the following data from the non-template spans is **lost**: + +| Data | Behavior | +|------|----------| +| **Span Events** | Only events from the template (slowest) span are preserved; events on the other spans are dropped | +| **Span Links** | Only links from the template span are preserved; links on the other spans are dropped | +| **Attributes** | Only the template span's attributes are kept; differing values on the other spans are lost | +| **Individual Timestamps** | Original start/end times are replaced by the group's time range | +| **SpanIDs** | Original SpanIDs are replaced by a single summary SpanID | + +### Aggregation Attributes +The following attributes are added to the summary span (shown with default `aggregation_attribute_prefix: "aggregation."`): + +| Attribute | Type | Description | +|-----------|------|-------------| +| `is_summary` | bool | Always `true` to identify summary spans | +| `span_count` | int64 | Number of spans that were aggregated | +| `duration_min_ns` | int64 | Minimum duration in nanoseconds | +| `duration_max_ns` | int64 | Maximum duration in nanoseconds | +| `duration_avg_ns` | int64 | Average duration in nanoseconds (integer division, truncated) | +| `duration_total_ns` | int64 | Total duration in nanoseconds | + +## Pipeline Placement + +This processor is designed to work best when placed after processors that ensure complete traces are available: + +```yaml +service: + pipelines: + traces: + receivers: [otlp] + processors: [groupbytrace, spanpruning, batch] + exporters: [otlp] +``` + +Or with tail sampling: + +```yaml +service: + pipelines: + traces: + receivers: [otlp] + processors: [tail_sampling, spanpruning, batch] + exporters: [otlp] +``` + +## Example + +### Basic Example + +A trace with repeated database queries (some failing): + +**Before Processing:** +``` +root-span (parent) +├── SELECT (leaf) - duration: 10ms, db.operation: select, status: OK +├── SELECT (leaf) - duration: 15ms, db.operation: select, status: OK +├── SELECT (leaf) - duration: 12ms, db.operation: select, status: OK +├── SELECT (leaf) - 
duration: 50ms, db.operation: select, status: Error +├── SELECT (leaf) - duration: 45ms, db.operation: select, status: Error +└── INSERT (leaf) - duration: 20ms, db.operation: insert, status: OK +``` + +**After Processing (with `min_spans_to_aggregate: 2`):** +``` +root-span (parent) +├── SELECT (summary, status: OK) +│ - aggregation.is_summary: true +│ - aggregation.span_count: 3 +│ - aggregation.duration_min_ns: 10000000 +│ - aggregation.duration_max_ns: 15000000 +│ - aggregation.duration_avg_ns: 12333333 +├── SELECT (summary, status: Error) +│ - aggregation.is_summary: true +│ - aggregation.span_count: 2 +│ - aggregation.duration_min_ns: 45000000 +│ - aggregation.duration_max_ns: 50000000 +│ - aggregation.duration_avg_ns: 47500000 +└── INSERT (unchanged - only 1 span, below threshold) +``` + +Note: Spans with different status codes are grouped separately, preserving error information. + +### Recursive Parent Aggregation Example + +When spans are aggregated, the processor also checks if their parent spans can be aggregated. Parent spans are eligible for aggregation when: +1. All of their children are being aggregated +2. They share the same name, kind, and status code with other eligible parents +3. They are not root spans (must have a parent) +4. 
At least 2 parents meet the criteria + +**Before Processing (with `min_spans_to_aggregate: 2`, `group_by_attributes: ["db.op"]`):** +``` +root +├── handler (status: OK) +│ └── SELECT (db.op=select, status: OK) ───┐ +├── handler (status: OK) │ leaf group A: 3 OK SELECTs +│ └── SELECT (db.op=select, status: OK) ───┤ +├── handler (status: OK) │ +│ └── SELECT (db.op=select, status: OK) ───┘ +├── handler (status: Error) +│ └── SELECT (db.op=select, status: Error) ┐ leaf group B: 2 Error SELECTs +├── handler (status: Error) │ +│ └── SELECT (db.op=select, status: Error) ┘ +├── handler (status: OK) +│ └── INSERT (db.op=insert, status: OK) ──── only 1, below threshold +└── worker (status: OK) + └── SELECT (db.op=select, status: OK) ──── different parent name +``` + +**After Processing:** +``` +root +├── handler (summary, status: OK, span_count: 3) +│ └── SELECT (summary, status: OK, span_count: 3) +├── handler (summary, status: Error, span_count: 2) +│ └── SELECT (summary, status: Error, span_count: 2) +├── handler (status: OK) +│ └── INSERT (status: OK) ─────────────────────────── unchanged +└── worker (status: OK) + └── SELECT (status: OK) ─────────────────────────── unchanged +``` + +**Why each span was handled this way:** + +| Span | Result | Reason | +|------|--------|--------| +| 3x handler (OK) with SELECT children | Aggregated | All children aggregated, same name+kind+status | +| 3x SELECT (OK) under handler | Aggregated | Same name + kind + status + attributes + parent name | +| 2x handler (Error) with SELECT children | Aggregated | All children aggregated, same name+kind+status | +| 2x SELECT (Error) under handler | Aggregated | Same name + kind + status + attributes + parent name | +| handler (OK) with INSERT child | Unchanged | Child not aggregated (only 1 INSERT) | +| INSERT (OK) | Unchanged | Below threshold (only 1 span) | +| worker (OK) | Unchanged | Child not aggregated | +| SELECT (OK) under worker | Unchanged | Different parent name than other SELECTs | + 
+## Limitations + +- Requires complete traces for accurate leaf detection +- Summary span inherits attributes from the slowest span in the group +- Parent spans are only aggregated when ALL their children are aggregated + +## Consistent Probability Sampling (CPS) Compatibility + +The processor is designed to be compatible with [Consistent Probability Sampling](https://opentelemetry.io/docs/specs/otel/trace/tracestate-probability-sampling/) (CPS). CPS uses TraceState to carry sampling metadata (`ot=th:...;rv:...`) where: + +- `th` (threshold) indicates the sampling probability threshold +- `rv` (randomness value) provides consistent randomness for sampling decisions + +**Why TraceState matters for aggregation:** + +Spans with different TraceState values represent different sampling populations with different "adjusted counts" (weights). Aggregating them together would produce statistically incorrect summaries and break downstream sampling decisions. + +The processor uses **exact TraceState matching** (not just the `th` value) because: +- The `rv` value affects sampling decisions +- Vendor-specific keys may have semantic meaning +- Key ordering may be significant + +## Telemetry + +The processor emits the following metrics to help monitor its operation: + +### Counters + +| Metric | Description | +|--------|-------------| +| `otelcol_processor_spanpruning_spans_received` | Total number of spans received by the processor | +| `otelcol_processor_spanpruning_spans_pruned` | Total number of spans removed by aggregation | +| `otelcol_processor_spanpruning_aggregations_created` | Total number of aggregation summary spans created | +| `otelcol_processor_spanpruning_traces_processed` | Total number of traces processed | + +### Histograms + +| Metric | Description | +|--------|-------------| +| `otelcol_processor_spanpruning_aggregation_group_size` | Distribution of the number of spans per aggregation group | +| `otelcol_processor_spanpruning_processing_duration` | Time taken 
to process each batch of traces (in seconds) | + +These metrics can be used to: +- Monitor the effectiveness of span pruning (compare `spans_received` vs `spans_pruned`) +- Track the compression ratio achieved by aggregation +- Identify processing bottlenecks via `processing_duration` +- Understand aggregation patterns via `aggregation_group_size` diff --git a/processor/spanpruningprocessor/aggregation.go b/processor/spanpruningprocessor/aggregation.go new file mode 100644 index 0000000000000..5567e9c848b17 --- /dev/null +++ b/processor/spanpruningprocessor/aggregation.go @@ -0,0 +1,178 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor" + +import ( + "encoding/binary" + "math/rand/v2" + "sort" + + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/ptrace" +) + +// aggregationGroup captures the spans to aggregate along with execution +// metadata (tree depth, preassigned summary ID). +type aggregationGroup struct { + nodes []*spanNode // nodes to aggregate (replaces []spanInfo for efficiency) + depth int // tree depth (0 = leaf, 1 = parent of leaf, etc.) + summarySpanID pcommon.SpanID // SpanID of the summary span (assigned before creation) + templateNode *spanNode // node to use as summary template (longest duration) +} + +// aggregationPlan orders aggregation groups for top-down execution and +// carries precomputed summary span IDs. +type aggregationPlan struct { + groups []aggregationGroup +} + +// findLongestDurationNode returns the node with the longest duration. +func findLongestDurationNode(nodes []*spanNode) *spanNode { + if len(nodes) == 0 { + return nil + } + longest := nodes[0] + // pcommon.Timestamp is uint64 nanoseconds; direct subtraction avoids + // creating intermediate time.Time objects (2 per span otherwise). 
+ longestDuration := int64(longest.span.EndTimestamp()) - int64(longest.span.StartTimestamp()) + for _, node := range nodes[1:] { + duration := int64(node.span.EndTimestamp()) - int64(node.span.StartTimestamp()) + if duration > longestDuration { + longest = node + longestDuration = duration + } + } + return longest +} + +// generateSpanID produces a non-cryptographic span ID suitable for summary +// spans; uniqueness is sufficient, not randomness strength. +func generateSpanID() pcommon.SpanID { + var id [8]byte + binary.BigEndian.PutUint64(id[:], rand.Uint64()) + return pcommon.SpanID(id) +} + +// buildAggregationPlan sorts aggregation groups by depth (parents before +// children) and preassigns summary SpanIDs to avoid conflicts during writes. +func (*spanPruningProcessor) buildAggregationPlan(groups map[string]aggregationGroup) aggregationPlan { + // Convert map to slice with pre-allocation + groupSlice := make([]aggregationGroup, 0, len(groups)) + for key := range groups { + groupSlice = append(groupSlice, groups[key]) + } + + // Sort by depth descending (highest depth first = top-down) + sort.Slice(groupSlice, func(i, j int) bool { + return groupSlice[i].depth > groupSlice[j].depth + }) + + // Pre-assign SpanIDs for all summary spans + for i := range groupSlice { + groupSlice[i].summarySpanID = generateSpanID() + } + + return aggregationPlan{groups: groupSlice} +} + +// executeAggregations performs the top-down creation of summary spans, batch +// removes originals, and returns the number of pruned spans. 
+func (p *spanPruningProcessor) executeAggregations(plan aggregationPlan) int { + // Track which parent SpanID should map to which summary SpanID + parentReplacements := make(map[pcommon.SpanID]pcommon.SpanID, len(plan.groups)*4) + + // Track spans to remove per ScopeSpans for batch removal + spansToRemove := make(map[ptrace.ScopeSpans]map[pcommon.SpanID]struct{}, len(plan.groups)) + prunedCount := 0 + + for i := range plan.groups { + group := &plan.groups[i] + // Calculate statistics and time range in single pass + data := p.calculateAggregationData(group.nodes) + + // Determine the parent SpanID for the summary span + // Use the first node's parent as template + originalParentID := group.nodes[0].span.ParentSpanID() + + // Check if the parent is being replaced by a summary span + summaryParentID := originalParentID + if replacementID, exists := parentReplacements[originalParentID]; exists { + summaryParentID = replacementID + } + + // Create summary span with correct parent + p.createSummarySpanWithParent(*group, data, summaryParentID) + + // Record that these original span IDs should be replaced by the summary span ID + for _, node := range group.nodes { + spanID := node.span.SpanID() + parentReplacements[spanID] = group.summarySpanID + scopeSpans := node.scopeSpans + if spansToRemove[scopeSpans] == nil { + spansToRemove[scopeSpans] = make(map[pcommon.SpanID]struct{}, len(group.nodes)) + } + spansToRemove[scopeSpans][spanID] = struct{}{} + } + prunedCount += len(group.nodes) + } + + // Batch remove all marked spans in a single pass per ScopeSpans + for scopeSpans, spanIDs := range spansToRemove { + scopeSpans.Spans().RemoveIf(func(span ptrace.Span) bool { + _, shouldRemove := spanIDs[span.SpanID()] + return shouldRemove + }) + } + + return prunedCount +} + +// createSummarySpanWithParent builds the summary span for an aggregation +// group, wiring it under the provided parent SpanID and attaching stats. 
+func (p *spanPruningProcessor) createSummarySpanWithParent(group aggregationGroup, data aggregationData, parentSpanID pcommon.SpanID) ptrace.Span { + // Use the template node (longest duration span) as a template + templateNode := group.templateNode + templateSpan := templateNode.span + scopeSpans := templateNode.scopeSpans + + // Create new span in the same ScopeSpans as the first span + newSpan := scopeSpans.Spans().AppendEmpty() + + // Copy basic properties from template + newSpan.SetName(templateSpan.Name()) + newSpan.SetTraceID(templateSpan.TraceID()) + newSpan.SetSpanID(group.summarySpanID) + newSpan.SetParentSpanID(parentSpanID) + newSpan.SetKind(templateSpan.Kind()) + + // Set timestamps from aggregation data + newSpan.SetStartTimestamp(data.earliestStart) + newSpan.SetEndTimestamp(data.latestEnd) + + // Copy attributes from template + templateSpan.Attributes().CopyTo(newSpan.Attributes()) + + // Copy status from template + templateSpan.Status().CopyTo(newSpan.Status()) + + // Copy TraceState from template for Consistent Probability Sampling compatibility + newSpan.TraceState().FromRaw(templateSpan.TraceState().AsRaw()) + + // Copy events and links from template + templateSpan.Events().CopyTo(newSpan.Events()) + templateSpan.Links().CopyTo(newSpan.Links()) + + // Add aggregation statistics as attributes + prefix := p.config.AggregationAttributePrefix + newSpan.Attributes().PutBool(prefix+"is_summary", true) + newSpan.Attributes().PutInt(prefix+"span_count", data.count) + newSpan.Attributes().PutInt(prefix+"duration_min_ns", int64(data.minDuration)) + newSpan.Attributes().PutInt(prefix+"duration_max_ns", int64(data.maxDuration)) + newSpan.Attributes().PutInt(prefix+"duration_total_ns", int64(data.sumDuration)) + if data.count > 0 { + newSpan.Attributes().PutInt(prefix+"duration_avg_ns", int64(data.sumDuration)/data.count) + } + + return newSpan +} diff --git a/processor/spanpruningprocessor/aggregation_test.go 
b/processor/spanpruningprocessor/aggregation_test.go new file mode 100644 index 0000000000000..55e86827e5a83 --- /dev/null +++ b/processor/spanpruningprocessor/aggregation_test.go @@ -0,0 +1,136 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/ptrace" +) + +func TestFindLongestDurationNode_Empty(t *testing.T) { + result := findLongestDurationNode(nil) + assert.Nil(t, result) + + result = findLongestDurationNode([]*spanNode{}) + assert.Nil(t, result) +} + +func TestFindLongestDurationNode_SingleNode(t *testing.T) { + nodes := createSpanNodesWithDurations(t, []int64{100}) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[0], result) +} + +func TestFindLongestDurationNode_LongestFirst(t *testing.T) { + // Longest duration is first in the slice + nodes := createSpanNodesWithDurations(t, []int64{500, 100, 200}) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[0], result, "should return first node (500ns)") +} + +func TestFindLongestDurationNode_LongestMiddle(t *testing.T) { + // Longest duration is in the middle + nodes := createSpanNodesWithDurations(t, []int64{100, 500, 200}) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[1], result, "should return middle node (500ns)") +} + +func TestFindLongestDurationNode_LongestLast(t *testing.T) { + // Longest duration is last in the slice + nodes := createSpanNodesWithDurations(t, []int64{100, 200, 500}) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[2], result, "should return last node (500ns)") +} + +func TestFindLongestDurationNode_EqualDurations(t *testing.T) { + // All durations 
are equal - should return first + nodes := createSpanNodesWithDurations(t, []int64{100, 100, 100}) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[0], result, "should return first node when all equal") +} + +func TestFindLongestDurationNode_LargeDurations(t *testing.T) { + // Test with large duration values (milliseconds in nanoseconds) + durations := []int64{ + 1_000_000, // 1ms + 500_000_000, // 500ms + 100_000_000, // 100ms + } + nodes := createSpanNodesWithDurations(t, durations) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[1], result, "should return node with 500ms duration") +} + +func TestFindLongestDurationNode_ZeroDuration(t *testing.T) { + // Test with zero duration spans + nodes := createSpanNodesWithDurations(t, []int64{0, 100, 0}) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[1], result, "should return node with non-zero duration") +} + +func TestFindLongestDurationNode_ManyNodes(t *testing.T) { + // Test with many nodes to verify iteration works correctly + durations := make([]int64, 100) + for i := range durations { + durations[i] = int64(i * 10) + } + // Set one in the middle to be the longest + durations[50] = 99999 + + nodes := createSpanNodesWithDurations(t, durations) + + result := findLongestDurationNode(nodes) + require.NotNil(t, result) + assert.Equal(t, nodes[50], result, "should return node at index 50 with longest duration") +} + +// createSpanNodesWithDurations creates span nodes with specified durations in nanoseconds +func createSpanNodesWithDurations(t *testing.T, durationsNs []int64) []*spanNode { + t.Helper() + + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + nodes 
:= make([]*spanNode, 0, len(durationsNs)) + baseTime := int64(1000000000) + + for i, duration := range durationsNs { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName("test") + span.SetStartTimestamp(pcommon.Timestamp(baseTime)) + span.SetEndTimestamp(pcommon.Timestamp(baseTime + duration)) + + nodes = append(nodes, &spanNode{ + span: span, + scopeSpans: ss, + }) + } + + return nodes +} diff --git a/processor/spanpruningprocessor/benchmark_testdata_test.go b/processor/spanpruningprocessor/benchmark_testdata_test.go new file mode 100644 index 0000000000000..631701420edb5 --- /dev/null +++ b/processor/spanpruningprocessor/benchmark_testdata_test.go @@ -0,0 +1,241 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor + +import ( + "fmt" + "time" + + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/ptrace" +) + +// testTraceID is a fixed trace ID used across all test trace generators. +var testTraceID = pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + +// makeSpanID converts a uint64 to a SpanID. +func makeSpanID(id uint64) pcommon.SpanID { + return pcommon.SpanID([8]byte{ + byte(id >> 56), byte(id >> 48), byte(id >> 40), byte(id >> 32), + byte(id >> 24), byte(id >> 16), byte(id >> 8), byte(id), + }) +} + +// generateTestTrace creates a flat test trace: root -> parents -> leaves. 
+func generateTestTrace(numSpans, leafSpansPerParent int) ptrace.Traces { + td := ptrace.NewTraces() + ss := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty() + + // Root span + root := ss.Spans().AppendEmpty() + root.SetTraceID(testTraceID) + root.SetSpanID(makeSpanID(1)) + root.SetName("root") + root.SetKind(ptrace.SpanKindServer) + root.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + root.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(time.Second))) + root.Status().SetCode(ptrace.StatusCodeOk) + + spanID := uint64(2) + numParents := max((numSpans-1)/leafSpansPerParent, 1) + + // Parent spans + parentIDs := make([]pcommon.SpanID, 0, numParents) + for i := 0; i < numParents && spanID < uint64(numSpans); i++ { + span := ss.Spans().AppendEmpty() + span.SetTraceID(testTraceID) + id := makeSpanID(spanID) + span.SetSpanID(id) + span.SetParentSpanID(root.SpanID()) + span.SetName(fmt.Sprintf("parent-%d", i)) + span.SetKind(ptrace.SpanKindInternal) + span.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + span.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(100 * time.Millisecond))) + span.Status().SetCode(ptrace.StatusCodeOk) + parentIDs = append(parentIDs, id) + spanID++ + } + + // Leaf spans + for i := 0; spanID < uint64(numSpans+1); i++ { + span := ss.Spans().AppendEmpty() + span.SetTraceID(testTraceID) + span.SetSpanID(makeSpanID(spanID)) + span.SetParentSpanID(parentIDs[i%len(parentIDs)]) + span.SetName("leaf-operation") + span.SetKind(ptrace.SpanKindClient) + span.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + span.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(10 * time.Millisecond))) + span.Status().SetCode(ptrace.StatusCodeOk) + span.Attributes().PutStr("http.method", "GET") + span.Attributes().PutStr("http.url", "/api/data") + spanID++ + } + + return td +} + +// generateSparseTrace creates a trace where only a small fraction aggregates. 
+func generateSparseTrace(numSpans, minSpans int) ptrace.Traces { + td := ptrace.NewTraces() + ss := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty() + + // Root span + root := ss.Spans().AppendEmpty() + root.SetTraceID(testTraceID) + root.SetSpanID(makeSpanID(1)) + root.SetName("root") + root.SetKind(ptrace.SpanKindServer) + root.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + root.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(time.Second))) + root.Status().SetCode(ptrace.StatusCodeOk) + + spanID := uint64(2) + + // Handler spans (unique names, won't aggregate) + numHandlers := numSpans / 10 + handlerIDs := make([]pcommon.SpanID, 0, numHandlers) + for i := range numHandlers { + span := ss.Spans().AppendEmpty() + span.SetTraceID(testTraceID) + id := makeSpanID(spanID) + span.SetSpanID(id) + span.SetParentSpanID(root.SpanID()) + span.SetName(fmt.Sprintf("handler-%d", i)) + span.SetKind(ptrace.SpanKindInternal) + span.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + span.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(100 * time.Millisecond))) + span.Status().SetCode(ptrace.StatusCodeOk) + handlerIDs = append(handlerIDs, id) + spanID++ + } + + // Unique leaf spans (won't aggregate) + numRepeated := minSpans * 2 + numUnique := numSpans - numHandlers - 1 - numRepeated + for i := 0; i < numUnique && spanID < uint64(numSpans+1); i++ { + span := ss.Spans().AppendEmpty() + span.SetTraceID(testTraceID) + span.SetSpanID(makeSpanID(spanID)) + span.SetParentSpanID(handlerIDs[i%len(handlerIDs)]) + span.SetName(fmt.Sprintf("unique-op-%d", i)) + span.SetKind(ptrace.SpanKindClient) + span.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + span.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(10 * time.Millisecond))) + span.Status().SetCode(ptrace.StatusCodeOk) + span.Attributes().PutStr("db.system", "postgresql") + spanID++ + } + + // Repeated leaf spans (will aggregate) + if len(handlerIDs) > 0 
{ + targetHandler := handlerIDs[0] + for i := 0; i < numRepeated && spanID < uint64(numSpans+1); i++ { + span := ss.Spans().AppendEmpty() + span.SetTraceID(testTraceID) + span.SetSpanID(makeSpanID(spanID)) + span.SetParentSpanID(targetHandler) + span.SetName("SELECT") + span.SetKind(ptrace.SpanKindClient) + span.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + span.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(10 * time.Millisecond))) + span.Status().SetCode(ptrace.StatusCodeOk) + span.Attributes().PutStr("db.system", "postgresql") + span.Attributes().PutStr("db.operation", "select") + spanID++ + } + } + + return td +} + +// generateDeepTrace creates a trace with specified depth and branching factor. +// Each level has spans with the same name, enabling parent aggregation. +func generateDeepTrace(depth, branchingFactor, leafsPerBranch, maxSpans int) ptrace.Traces { + td := ptrace.NewTraces() + ss := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty() + + spanID := uint64(1) + totalSpans := 0 + + // Root span + root := ss.Spans().AppendEmpty() + root.SetTraceID(testTraceID) + root.SetSpanID(makeSpanID(spanID)) + root.SetName("root") + root.SetKind(ptrace.SpanKindServer) + root.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + root.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(time.Second))) + root.Status().SetCode(ptrace.StatusCodeOk) + spanID++ + totalSpans++ + + // Build tree level by level + currentLevel := []pcommon.SpanID{root.SpanID()} + + for d := 1; d < depth && totalSpans < maxSpans; d++ { + nextLevel := make([]pcommon.SpanID, 0, len(currentLevel)*branchingFactor) + levelName := fmt.Sprintf("level-%d", d) + + for _, parentID := range currentLevel { + for b := 0; b < branchingFactor && totalSpans < maxSpans; b++ { + span := ss.Spans().AppendEmpty() + span.SetTraceID(testTraceID) + id := makeSpanID(spanID) + span.SetSpanID(id) + span.SetParentSpanID(parentID) + span.SetName(levelName) + 
span.SetKind(ptrace.SpanKindInternal) + span.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + span.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(100 * time.Millisecond))) + span.Status().SetCode(ptrace.StatusCodeOk) + nextLevel = append(nextLevel, id) + spanID++ + totalSpans++ + } + } + currentLevel = nextLevel + } + + // Add leaf spans + for _, parentID := range currentLevel { + for l := 0; l < leafsPerBranch && totalSpans < maxSpans; l++ { + span := ss.Spans().AppendEmpty() + span.SetTraceID(testTraceID) + span.SetSpanID(makeSpanID(spanID)) + span.SetParentSpanID(parentID) + span.SetName("db-query") + span.SetKind(ptrace.SpanKindClient) + span.SetStartTimestamp(pcommon.NewTimestampFromTime(time.Now())) + span.SetEndTimestamp(pcommon.NewTimestampFromTime(time.Now().Add(10 * time.Millisecond))) + span.Status().SetCode(ptrace.StatusCodeOk) + span.Attributes().PutStr("db.system", "postgresql") + span.Attributes().PutStr("db.operation", "select") + spanID++ + totalSpans++ + } + } + + return td +} + +// generateTestSpans extracts spanInfo slice from a trace for tree benchmarks. 
+func generateTestSpans(numSpans, leafSpansPerParent int) []spanInfo { + td := generateTestTrace(numSpans, leafSpansPerParent) + spans := make([]spanInfo, 0, numSpans) + + for i := 0; i < td.ResourceSpans().Len(); i++ { + for j := 0; j < td.ResourceSpans().At(i).ScopeSpans().Len(); j++ { + ss := td.ResourceSpans().At(i).ScopeSpans().At(j) + for k := 0; k < ss.Spans().Len(); k++ { + spans = append(spans, spanInfo{ + span: ss.Spans().At(k), + scopeSpans: ss, + }) + } + } + } + + return spans +} diff --git a/processor/spanpruningprocessor/config.go b/processor/spanpruningprocessor/config.go index a73cbcd322db8..44cf682588e57 100644 --- a/processor/spanpruningprocessor/config.go +++ b/processor/spanpruningprocessor/config.go @@ -3,9 +3,77 @@ package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor" -// Config holds the configuration for the SpanPruning processor. -type Config struct{} +import ( + "errors" + "fmt" + "strings" + + "github.com/gobwas/glob" + "go.opentelemetry.io/collector/component" +) + +// Config defines the configuration options for the span pruning processor +// and the rules used to identify and aggregate similar spans. +type Config struct { + // GroupByAttributes lists attribute patterns used to decide which leaf spans + // belong in the same aggregation group. Spans must share the span name and + // have identical values for every matched attribute to be grouped. Patterns + // accept glob syntax, for example: + // - "db.*" matches db.operation, db.name, db.statement, etc. + // - "http.request.*" matches http.request.method, http.request.header, etc. + // - "service" matches only the exact key "service" + // Examples: ["db.*", "http.method"], ["rpc.*"]. + GroupByAttributes []string `mapstructure:"group_by_attributes"` + + // MinSpansToAggregate is the minimum number of similar spans required before + // aggregation occurs. Groups smaller than this threshold are preserved. 
+ // Default: 5 + MinSpansToAggregate int `mapstructure:"min_spans_to_aggregate"` + + // MaxParentDepth bounds how many ancestor levels above the aggregated leaves + // can also be aggregated. Use 0 to aggregate only leaves, -1 for unlimited + // depth, or a positive integer to cap traversal. + // Default: 1 + MaxParentDepth int `mapstructure:"max_parent_depth"` + + // AggregationAttributePrefix prefixes all aggregation-related attributes that + // are added to summary spans. + // Default: "aggregation." + AggregationAttributePrefix string `mapstructure:"aggregation_attribute_prefix"` +} + +var _ component.Config = (*Config)(nil) + +// Validate checks if the processor configuration is valid +func (cfg *Config) Validate() error { + if cfg.MinSpansToAggregate < 2 { + return errors.New("min_spans_to_aggregate must be at least 2") + } + + if cfg.MaxParentDepth < -1 { + return errors.New("max_parent_depth must be -1 (unlimited) or >= 0") + } + + // Validate AggregationAttributePrefix + prefix := strings.TrimSpace(cfg.AggregationAttributePrefix) + if prefix == "" { + return errors.New("aggregation_attribute_prefix cannot be empty") + } + if strings.ContainsAny(prefix, " \t\n\r") { + return errors.New("aggregation_attribute_prefix cannot contain whitespace") + } + + // Validate GroupByAttributes glob patterns + for i, pattern := range cfg.GroupByAttributes { + if strings.TrimSpace(pattern) == "" { + return fmt.Errorf("group_by_attributes[%d] cannot be empty", i) + } + // Try to compile the same way processor.go does to catch invalid syntax early + _, err := glob.Compile(pattern) + if err != nil { + return fmt.Errorf("invalid glob pattern at group_by_attributes[%d]: %q: %w", i, pattern, err) + } + } -func (*Config) Validate() error { return nil } diff --git a/processor/spanpruningprocessor/config_test.go b/processor/spanpruningprocessor/config_test.go new file mode 100644 index 0000000000000..c2abc8e816e13 --- /dev/null +++ b/processor/spanpruningprocessor/config_test.go 
@@ -0,0 +1,169 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/confmap/confmaptest" + + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor/internal/metadata" +) + +func TestLoadConfig(t *testing.T) { + t.Parallel() + + tests := []struct { + id component.ID + expected *Config + errorMessage string + }{ + { + id: component.NewIDWithName(metadata.Type, ""), + expected: &Config{ + GroupByAttributes: []string{"db.operation"}, + MinSpansToAggregate: 5, + MaxParentDepth: 1, + AggregationAttributePrefix: "aggregation.", + }, + }, + { + id: component.NewIDWithName(metadata.Type, "custom"), + expected: &Config{ + GroupByAttributes: []string{"db.operation", "db.name"}, + MinSpansToAggregate: 3, + MaxParentDepth: 1, + AggregationAttributePrefix: "batch.", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.id.String(), func(t *testing.T) { + cm, err := confmaptest.LoadConf(filepath.Join("testdata", "config.yaml")) + require.NoError(t, err) + + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + + sub, err := cm.Sub(tt.id.String()) + require.NoError(t, err) + require.NoError(t, sub.Unmarshal(cfg)) + + oCfg := cfg.(*Config) + if tt.errorMessage != "" { + assert.EqualError(t, oCfg.Validate(), tt.errorMessage) + return + } + + assert.NoError(t, oCfg.Validate()) + assert.Equal(t, tt.expected, oCfg) + }) + } +} + +func TestConfig_Validate(t *testing.T) { + tests := []struct { + name string + config *Config + expectError bool + }{ + { + name: "valid config", + config: &Config{ + MinSpansToAggregate: 2, + AggregationAttributePrefix: "aggregation.", + GroupByAttributes: []string{"db.operation"}, + }, + expectError: false, + }, + { + name: "min_spans_to_aggregate below 
minimum", + config: &Config{ + MinSpansToAggregate: 1, + }, + expectError: true, + }, + { + name: "min_spans_to_aggregate zero", + config: &Config{ + MinSpansToAggregate: 0, + }, + expectError: true, + }, + { + name: "min_spans_to_aggregate negative", + config: &Config{ + MinSpansToAggregate: -1, + }, + expectError: true, + }, + { + name: "empty aggregation_attribute_prefix", + config: &Config{ + MinSpansToAggregate: 2, + AggregationAttributePrefix: "", + }, + expectError: true, + }, + { + name: "whitespace-only aggregation_attribute_prefix", + config: &Config{ + MinSpansToAggregate: 2, + AggregationAttributePrefix: " ", + }, + expectError: true, + }, + { + name: "empty group_by_attributes pattern", + config: &Config{ + MinSpansToAggregate: 2, + AggregationAttributePrefix: "aggregation.", + GroupByAttributes: []string{"db.operation", ""}, + }, + expectError: true, + }, + { + name: "whitespace-only group_by_attributes pattern", + config: &Config{ + MinSpansToAggregate: 2, + AggregationAttributePrefix: "aggregation.", + GroupByAttributes: []string{"db.operation", " "}, + }, + expectError: true, + }, + { + name: "invalid glob pattern in group_by_attributes", + config: &Config{ + MinSpansToAggregate: 2, + AggregationAttributePrefix: "aggregation.", + GroupByAttributes: []string{"db.operation", "[invalid*"}, + }, + expectError: true, + }, + { + name: "max_parent_depth unlimited", + config: &Config{ + MinSpansToAggregate: 2, + AggregationAttributePrefix: "aggregation.", + MaxParentDepth: -1, + }, + expectError: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.config.Validate() + if tt.expectError { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + }) + } +} diff --git a/processor/spanpruningprocessor/doc.go b/processor/spanpruningprocessor/doc.go index e0997951e7b40..0e0fad05e1871 100644 --- a/processor/spanpruningprocessor/doc.go +++ b/processor/spanpruningprocessor/doc.go @@ -3,5 +3,9 @@ //go:generate make 
mdatagen -// Package spanpruningprocessor +// Package spanpruningprocessor detects duplicate or similar leaf spans within a +// single trace and replaces each set with a single aggregated summary span. +// Leaf spans are spans that are never referenced as a parent by another span. +// When all children of a parent are aggregated, the parent can also be +// aggregated, preserving the trace structure while reducing volume. package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor" diff --git a/processor/spanpruningprocessor/documentation.md b/processor/spanpruningprocessor/documentation.md new file mode 100644 index 0000000000000..98c0767023f5f --- /dev/null +++ b/processor/spanpruningprocessor/documentation.md @@ -0,0 +1,55 @@ +[comment]: <> (Code generated by mdatagen. DO NOT EDIT.) + +# spanpruning + +## Internal Telemetry + +The following telemetry is emitted by this component. + +### otelcol_processor_spanpruning_aggregation_group_size + +Distribution of spans per aggregation group + +| Unit | Metric Type | Value Type | Stability | +| ---- | ----------- | ---------- | --------- | +| {spans} | Histogram | Int | Development | + +### otelcol_processor_spanpruning_aggregations_created + +Total aggregation summary spans created + +| Unit | Metric Type | Value Type | Monotonic | Stability | +| ---- | ----------- | ---------- | --------- | --------- | +| {spans} | Sum | Int | true | Development | + +### otelcol_processor_spanpruning_processing_duration + +Time to process each batch of traces + +| Unit | Metric Type | Value Type | Stability | +| ---- | ----------- | ---------- | --------- | +| s | Histogram | Double | Development | + +### otelcol_processor_spanpruning_spans_pruned + +Total spans pruned/removed by aggregation + +| Unit | Metric Type | Value Type | Monotonic | Stability | +| ---- | ----------- | ---------- | --------- | --------- | +| {spans} | Sum | Int | true | Development | + +### 
otelcol_processor_spanpruning_spans_received + +Total spans received by the processor + +| Unit | Metric Type | Value Type | Monotonic | Stability | +| ---- | ----------- | ---------- | --------- | --------- | +| {spans} | Sum | Int | true | Development | + +### otelcol_processor_spanpruning_traces_processed + +Total traces processed + +| Unit | Metric Type | Value Type | Monotonic | Stability | +| ---- | ----------- | ---------- | --------- | --------- | +| {traces} | Sum | Int | true | Development | diff --git a/processor/spanpruningprocessor/factory.go b/processor/spanpruningprocessor/factory.go index eca7522b51bda..7075ca8cc51dd 100644 --- a/processor/spanpruningprocessor/factory.go +++ b/processor/spanpruningprocessor/factory.go @@ -21,12 +21,15 @@ func NewFactory() processor.Factory { return processor.NewFactory( metadata.Type, createDefaultConfig, - processor.WithTraces(createTracesProcessor, metadata.TracesStability), - ) + processor.WithTraces(createTracesProcessor, metadata.TracesStability)) } func createDefaultConfig() component.Config { - return &Config{} + return &Config{ + MinSpansToAggregate: 5, + MaxParentDepth: 1, + AggregationAttributePrefix: "aggregation.", + } } func createTracesProcessor( @@ -35,12 +38,24 @@ func createTracesProcessor( cfg component.Config, nextConsumer consumer.Traces, ) (processor.Traces, error) { + pCfg := cfg.(*Config) + + telemetryBuilder, err := metadata.NewTelemetryBuilder(set.TelemetrySettings) + if err != nil { + return nil, err + } + + p, err := newSpanPruningProcessor(set, pCfg, telemetryBuilder) + if err != nil { + return nil, err + } + return processorhelper.NewTraces( ctx, set, cfg, nextConsumer, - newSpanPruningProcessor().processTraces, + p.processTraces, processorhelper.WithCapabilities(processorCapabilities), - ) + processorhelper.WithShutdown(p.shutdown)) } diff --git a/processor/spanpruningprocessor/factory_test.go b/processor/spanpruningprocessor/factory_test.go new file mode 100644 index 
0000000000000..0a18549eb4201 --- /dev/null +++ b/processor/spanpruningprocessor/factory_test.go @@ -0,0 +1,48 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/consumer/consumertest" + "go.opentelemetry.io/collector/processor/processortest" + + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor/internal/metadata" +) + +func TestFactory_Type(t *testing.T) { + factory := NewFactory() + assert.Equal(t, metadata.Type, factory.Type()) +} + +func TestFactory_CreateDefaultConfig(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + + assert.NotNil(t, cfg) + assert.NoError(t, componenttest.CheckConfigStruct(cfg)) + + oCfg := cfg.(*Config) + assert.Equal(t, 5, oCfg.MinSpansToAggregate) + assert.Equal(t, "aggregation.", oCfg.AggregationAttributePrefix) +} + +func TestFactory_CreateTracesProcessor(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + + tp, err := factory.CreateTraces( + t.Context(), + processortest.NewNopSettings(metadata.Type), + cfg, + consumertest.NewNop(), + ) + + require.NoError(t, err) + assert.NotNil(t, tp) +} diff --git a/processor/spanpruningprocessor/go.mod b/processor/spanpruningprocessor/go.mod index 7e3852d001151..6df20810c3f4a 100644 --- a/processor/spanpruningprocessor/go.mod +++ b/processor/spanpruningprocessor/go.mod @@ -3,6 +3,7 @@ module github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanp go 1.25.0 require ( + github.com/gobwas/glob v0.2.3 github.com/stretchr/testify v1.11.1 go.opentelemetry.io/collector/component v1.55.1-0.20260409104450-d686cf9058ce go.opentelemetry.io/collector/component/componenttest v0.149.1-0.20260409104450-d686cf9058ce @@ -13,7 +14,11 @@ 
require ( go.opentelemetry.io/collector/processor v1.55.1-0.20260409104450-d686cf9058ce go.opentelemetry.io/collector/processor/processorhelper v0.149.1-0.20260409104450-d686cf9058ce go.opentelemetry.io/collector/processor/processortest v0.149.1-0.20260409104450-d686cf9058ce + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 go.uber.org/goleak v1.3.0 + go.uber.org/zap v1.27.1 ) require ( @@ -22,7 +27,6 @@ require ( github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-viper/mapstructure/v2 v2.5.0 // indirect - github.com/gobwas/glob v0.2.3 // indirect github.com/google/uuid v1.6.0 // indirect github.com/hashicorp/go-version v1.9.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -44,12 +48,8 @@ require ( go.opentelemetry.io/collector/pipeline v1.55.1-0.20260409104450-d686cf9058ce // indirect go.opentelemetry.io/collector/processor/xprocessor v0.149.1-0.20260409104450-d686cf9058ce // indirect go.opentelemetry.io/otel v1.43.0 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect go.opentelemetry.io/otel/sdk v1.43.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect - go.opentelemetry.io/otel/trace v1.43.0 // indirect go.uber.org/multierr v1.11.0 // indirect - go.uber.org/zap v1.27.1 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/sys v0.42.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/processor/spanpruningprocessor/grouping.go b/processor/spanpruningprocessor/grouping.go new file mode 100644 index 0000000000000..47834551d0036 --- /dev/null +++ b/processor/spanpruningprocessor/grouping.go @@ -0,0 +1,191 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor" + +import ( + "encoding/base64" + "sort" + "strconv" + 
"strings" + "sync" + + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/ptrace" +) + +// builderPool reduces allocations in the hot path by reusing string builders. +var builderPool = sync.Pool{ + New: func() any { + return &strings.Builder{} + }, +} + +// buildGroupKey assembles the grouping key for a span using its name, +// status, and configured attribute matches. A pooled builder minimizes +// allocations in this frequently executed path. +func (p *spanPruningProcessor) buildGroupKey(span ptrace.Span) string { + builder := builderPool.Get().(*strings.Builder) + builder.Reset() + defer builderPool.Put(builder) + + builder.WriteString(span.Name()) + + // Include span kind in grouping key + builder.WriteString("|kind=") + builder.WriteString(span.Kind().String()) + + // Include status code in grouping key + builder.WriteString("|status=") + builder.WriteString(span.Status().Code().String()) + + // Include TraceState for Consistent Probability Sampling (CPS) compatibility. + // Spans with different TraceState values (e.g., different sampling thresholds) + // represent different sampling populations and must not be aggregated together. 
+ builder.WriteString("|ts=") + builder.WriteString(span.TraceState().AsRaw()) + + attrs := span.Attributes() + + // Collect all matching attribute key-value pairs + matchedAttrs := make(map[string]pcommon.Value) + attrs.Range(func(key string, value pcommon.Value) bool { + for _, pattern := range p.attributePatterns { + if pattern.glob.Match(key) { + matchedAttrs[key] = value + break // Only match each key once + } + } + return true + }) + + // Sort keys for consistent ordering in the group key + keys := make([]string, 0, len(matchedAttrs)) + for k := range matchedAttrs { + keys = append(keys, k) + } + sort.Strings(keys) + + // Build the group key with sorted attribute key-value pairs + for _, key := range keys { + builder.WriteString("|") + builder.WriteString(key) + builder.WriteString("=") + writeAttributeValueKey(builder, matchedAttrs[key]) + } + + return builder.String() +} + +func writeAttributeValueKey(builder *strings.Builder, value pcommon.Value) { + builder.WriteString(value.Type().String()) + builder.WriteString(":") + switch value.Type() { + case pcommon.ValueTypeEmpty: + return + case pcommon.ValueTypeStr: + builder.WriteString(value.Str()) + case pcommon.ValueTypeBool: + builder.WriteString(strconv.FormatBool(value.Bool())) + case pcommon.ValueTypeInt: + builder.WriteString(strconv.FormatInt(value.Int(), 10)) + case pcommon.ValueTypeDouble: + builder.WriteString(strconv.FormatFloat(value.Double(), 'g', -1, 64)) + case pcommon.ValueTypeBytes: + bytesValue := value.Bytes().AsRaw() + builder.WriteString(base64.StdEncoding.EncodeToString(bytesValue)) + case pcommon.ValueTypeMap: + writeAttributeMapKey(builder, value.Map()) + case pcommon.ValueTypeSlice: + writeAttributeSliceKey(builder, value.Slice()) + } +} + +func writeAttributeMapKey(builder *strings.Builder, value pcommon.Map) { + keys := make([]string, 0, value.Len()) + value.Range(func(key string, _ pcommon.Value) bool { + keys = append(keys, key) + return true + }) + sort.Strings(keys) + 
builder.WriteString("{") + for index, key := range keys { + if index > 0 { + builder.WriteString(",") + } + builder.WriteString(key) + builder.WriteString("=") + attrValue, _ := value.Get(key) + writeAttributeValueKey(builder, attrValue) + } + builder.WriteString("}") +} + +func writeAttributeSliceKey(builder *strings.Builder, value pcommon.Slice) { + builder.WriteString("[") + for index := 0; index < value.Len(); index++ { + if index > 0 { + builder.WriteString(",") + } + writeAttributeValueKey(builder, value.At(index)) + } + builder.WriteString("]") +} + +// buildParentGroupKey constructs a parent grouping key from name and status +// only; attributes are intentionally excluded for parent aggregation. +func (*spanPruningProcessor) buildParentGroupKey(span ptrace.Span) string { + builder := builderPool.Get().(*strings.Builder) + builder.Reset() + defer builderPool.Put(builder) + + builder.WriteString(span.Name()) + builder.WriteString("|kind=") + builder.WriteString(span.Kind().String()) + builder.WriteString("|status=") + builder.WriteString(span.Status().Code().String()) + // Include TraceState for CPS compatibility + builder.WriteString("|ts=") + builder.WriteString(span.TraceState().AsRaw()) + return builder.String() +} + +// buildLeafGroupKey derives a leaf grouping key that includes the parent's +// span name (if present) plus the standard grouping key, caching results per +// node to avoid recomputation. 
+func (p *spanPruningProcessor) buildLeafGroupKey(node *spanNode) string { + // Use cached group key if available + if node.groupKey != "" { + return node.groupKey + } + + builder := builderPool.Get().(*strings.Builder) + builder.Reset() + defer builderPool.Put(builder) + + // Include parent span name to separate groups by parent + if node.parent != nil { + builder.WriteString("parent=") + builder.WriteString(node.parent.span.Name()) + builder.WriteString("|") + } + + // Include regular group key (name + status + attributes) + builder.WriteString(p.buildGroupKey(node.span)) + + // Cache the key for future use + node.groupKey = builder.String() + return node.groupKey +} + +// groupLeafNodesByKey groups leaf nodes by their derived key so that spans +// with identical grouping characteristics can be aggregated together. +func (p *spanPruningProcessor) groupLeafNodesByKey(leafNodes []*spanNode) map[string][]*spanNode { + // Pre-size map based on expected number of groups (assume ~1/4 unique groups) + groups := make(map[string][]*spanNode, len(leafNodes)/4+1) + for _, node := range leafNodes { + key := p.buildLeafGroupKey(node) + groups[key] = append(groups[key], node) + } + return groups +} diff --git a/processor/spanpruningprocessor/internal/metadata/generated_status.go b/processor/spanpruningprocessor/internal/metadata/generated_status.go index 9766a88b115a6..71cf4cf138adb 100644 --- a/processor/spanpruningprocessor/internal/metadata/generated_status.go +++ b/processor/spanpruningprocessor/internal/metadata/generated_status.go @@ -12,5 +12,5 @@ var ( ) const ( - TracesStability = component.StabilityLevelDevelopment + TracesStability = component.StabilityLevelAlpha ) diff --git a/processor/spanpruningprocessor/internal/metadata/generated_telemetry.go b/processor/spanpruningprocessor/internal/metadata/generated_telemetry.go new file mode 100644 index 0000000000000..13f78beb99297 --- /dev/null +++ b/processor/spanpruningprocessor/internal/metadata/generated_telemetry.go 
@@ -0,0 +1,102 @@ +// Code generated by mdatagen. DO NOT EDIT. + +package metadata + +import ( + "errors" + "sync" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +func Meter(settings component.TelemetrySettings) metric.Meter { + return settings.MeterProvider.Meter("github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor") +} + +func Tracer(settings component.TelemetrySettings) trace.Tracer { + return settings.TracerProvider.Tracer("github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor") +} + +// TelemetryBuilder provides an interface for components to report telemetry +// as defined in metadata and user config. +type TelemetryBuilder struct { + meter metric.Meter + mu sync.Mutex + registrations []metric.Registration + ProcessorSpanpruningAggregationGroupSize metric.Int64Histogram + ProcessorSpanpruningAggregationsCreated metric.Int64Counter + ProcessorSpanpruningProcessingDuration metric.Float64Histogram + ProcessorSpanpruningSpansPruned metric.Int64Counter + ProcessorSpanpruningSpansReceived metric.Int64Counter + ProcessorSpanpruningTracesProcessed metric.Int64Counter +} + +// TelemetryBuilderOption applies changes to default builder. +type TelemetryBuilderOption interface { + apply(*TelemetryBuilder) +} + +type telemetryBuilderOptionFunc func(mb *TelemetryBuilder) + +func (tbof telemetryBuilderOptionFunc) apply(mb *TelemetryBuilder) { + tbof(mb) +} + +// Shutdown unregister all registered callbacks for async instruments. 
+func (builder *TelemetryBuilder) Shutdown() { + builder.mu.Lock() + defer builder.mu.Unlock() + for _, reg := range builder.registrations { + reg.Unregister() + } +} + +// NewTelemetryBuilder provides a struct with methods to update all internal telemetry +// for a component +func NewTelemetryBuilder(settings component.TelemetrySettings, options ...TelemetryBuilderOption) (*TelemetryBuilder, error) { + builder := TelemetryBuilder{} + for _, op := range options { + op.apply(&builder) + } + builder.meter = Meter(settings) + var err, errs error + builder.ProcessorSpanpruningAggregationGroupSize, err = builder.meter.Int64Histogram( + "otelcol_processor_spanpruning_aggregation_group_size", + metric.WithDescription("Distribution of spans per aggregation group [Development]"), + metric.WithUnit("{spans}"), + ) + errs = errors.Join(errs, err) + builder.ProcessorSpanpruningAggregationsCreated, err = builder.meter.Int64Counter( + "otelcol_processor_spanpruning_aggregations_created", + metric.WithDescription("Total aggregation summary spans created [Development]"), + metric.WithUnit("{spans}"), + ) + errs = errors.Join(errs, err) + builder.ProcessorSpanpruningProcessingDuration, err = builder.meter.Float64Histogram( + "otelcol_processor_spanpruning_processing_duration", + metric.WithDescription("Time to process each batch of traces [Development]"), + metric.WithUnit("s"), + ) + errs = errors.Join(errs, err) + builder.ProcessorSpanpruningSpansPruned, err = builder.meter.Int64Counter( + "otelcol_processor_spanpruning_spans_pruned", + metric.WithDescription("Total spans pruned/removed by aggregation [Development]"), + metric.WithUnit("{spans}"), + ) + errs = errors.Join(errs, err) + builder.ProcessorSpanpruningSpansReceived, err = builder.meter.Int64Counter( + "otelcol_processor_spanpruning_spans_received", + metric.WithDescription("Total spans received by the processor [Development]"), + metric.WithUnit("{spans}"), + ) + errs = errors.Join(errs, err) + 
builder.ProcessorSpanpruningTracesProcessed, err = builder.meter.Int64Counter( + "otelcol_processor_spanpruning_traces_processed", + metric.WithDescription("Total traces processed [Development]"), + metric.WithUnit("{traces}"), + ) + errs = errors.Join(errs, err) + return &builder, errs +} diff --git a/processor/spanpruningprocessor/internal/metadata/generated_telemetry_test.go b/processor/spanpruningprocessor/internal/metadata/generated_telemetry_test.go new file mode 100644 index 0000000000000..a5557ac81eba9 --- /dev/null +++ b/processor/spanpruningprocessor/internal/metadata/generated_telemetry_test.go @@ -0,0 +1,73 @@ +// Code generated by mdatagen. DO NOT EDIT. + +package metadata + +import ( + "testing" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/otel/metric" + embeddedmetric "go.opentelemetry.io/otel/metric/embedded" + noopmetric "go.opentelemetry.io/otel/metric/noop" + "go.opentelemetry.io/otel/trace" + embeddedtrace "go.opentelemetry.io/otel/trace/embedded" + nooptrace "go.opentelemetry.io/otel/trace/noop" +) + +type mockMeter struct { + noopmetric.Meter + name string +} +type mockMeterProvider struct { + embeddedmetric.MeterProvider +} + +func (m mockMeterProvider) Meter(name string, opts ...metric.MeterOption) metric.Meter { + return mockMeter{name: name} +} + +type mockTracer struct { + nooptrace.Tracer + name string +} + +type mockTracerProvider struct { + embeddedtrace.TracerProvider +} + +func (m mockTracerProvider) Tracer(name string, opts ...trace.TracerOption) trace.Tracer { + return mockTracer{name: name} +} + +func TestProviders(t *testing.T) { + set := component.TelemetrySettings{ + MeterProvider: mockMeterProvider{}, + TracerProvider: mockTracerProvider{}, + } + + meter := Meter(set) + if m, ok := meter.(mockMeter); ok { + require.Equal(t, 
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor", m.name) + } else { + require.Fail(t, "returned Meter not mockMeter") + } + + tracer := Tracer(set) + if m, ok := tracer.(mockTracer); ok { + require.Equal(t, "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor", m.name) + } else { + require.Fail(t, "returned Meter not mockTracer") + } +} + +func TestNewTelemetryBuilder(t *testing.T) { + set := componenttest.NewNopTelemetrySettings() + applied := false + _, err := NewTelemetryBuilder(set, telemetryBuilderOptionFunc(func(b *TelemetryBuilder) { + applied = true + })) + require.NoError(t, err) + require.True(t, applied) +} diff --git a/processor/spanpruningprocessor/internal/metadatatest/generated_telemetrytest.go b/processor/spanpruningprocessor/internal/metadatatest/generated_telemetrytest.go new file mode 100644 index 0000000000000..1ab8e4cadc8ac --- /dev/null +++ b/processor/spanpruningprocessor/internal/metadatatest/generated_telemetrytest.go @@ -0,0 +1,116 @@ +// Code generated by mdatagen. DO NOT EDIT. 
+ +package metadatatest + +import ( + "testing" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/processor" + "go.opentelemetry.io/collector/processor/processortest" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + "go.opentelemetry.io/otel/sdk/metric/metricdata/metricdatatest" +) + +func NewSettings(tt *componenttest.Telemetry) processor.Settings { + set := processortest.NewNopSettings(processortest.NopType) + set.ID = component.NewID(component.MustNewType("spanpruning")) + set.TelemetrySettings = tt.NewTelemetrySettings() + return set +} + +func AssertEqualProcessorSpanpruningAggregationGroupSize(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.HistogramDataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_processor_spanpruning_aggregation_group_size", + Description: "Distribution of spans per aggregation group [Development]", + Unit: "{spans}", + Data: metricdata.Histogram[int64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_processor_spanpruning_aggregation_group_size") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualProcessorSpanpruningAggregationsCreated(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_processor_spanpruning_aggregations_created", + Description: "Total aggregation summary spans created [Development]", + Unit: "{spans}", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_processor_spanpruning_aggregations_created") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) 
+} + +func AssertEqualProcessorSpanpruningProcessingDuration(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.HistogramDataPoint[float64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_processor_spanpruning_processing_duration", + Description: "Time to process each batch of traces [Development]", + Unit: "s", + Data: metricdata.Histogram[float64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_processor_spanpruning_processing_duration") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualProcessorSpanpruningSpansPruned(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_processor_spanpruning_spans_pruned", + Description: "Total spans pruned/removed by aggregation [Development]", + Unit: "{spans}", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_processor_spanpruning_spans_pruned") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualProcessorSpanpruningSpansReceived(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_processor_spanpruning_spans_received", + Description: "Total spans received by the processor [Development]", + Unit: "{spans}", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_processor_spanpruning_spans_received") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) 
+} + +func AssertEqualProcessorSpanpruningTracesProcessed(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_processor_spanpruning_traces_processed", + Description: "Total traces processed [Development]", + Unit: "{traces}", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_processor_spanpruning_traces_processed") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} diff --git a/processor/spanpruningprocessor/internal/metadatatest/generated_telemetrytest_test.go b/processor/spanpruningprocessor/internal/metadatatest/generated_telemetrytest_test.go new file mode 100644 index 0000000000000..378b4f8e7b55e --- /dev/null +++ b/processor/spanpruningprocessor/internal/metadatatest/generated_telemetrytest_test.go @@ -0,0 +1,48 @@ +// Code generated by mdatagen. DO NOT EDIT. 
+ +package metadatatest + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + "go.opentelemetry.io/otel/sdk/metric/metricdata/metricdatatest" + + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor/internal/metadata" +) + +func TestSetupTelemetry(t *testing.T) { + testTel := componenttest.NewTelemetry() + tb, err := metadata.NewTelemetryBuilder(testTel.NewTelemetrySettings()) + require.NoError(t, err) + defer tb.Shutdown() + tb.ProcessorSpanpruningAggregationGroupSize.Record(context.Background(), 1) + tb.ProcessorSpanpruningAggregationsCreated.Add(context.Background(), 1) + tb.ProcessorSpanpruningProcessingDuration.Record(context.Background(), 1) + tb.ProcessorSpanpruningSpansPruned.Add(context.Background(), 1) + tb.ProcessorSpanpruningSpansReceived.Add(context.Background(), 1) + tb.ProcessorSpanpruningTracesProcessed.Add(context.Background(), 1) + AssertEqualProcessorSpanpruningAggregationGroupSize(t, testTel, + []metricdata.HistogramDataPoint[int64]{{}}, metricdatatest.IgnoreValue(), + metricdatatest.IgnoreTimestamp()) + AssertEqualProcessorSpanpruningAggregationsCreated(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualProcessorSpanpruningProcessingDuration(t, testTel, + []metricdata.HistogramDataPoint[float64]{{}}, metricdatatest.IgnoreValue(), + metricdatatest.IgnoreTimestamp()) + AssertEqualProcessorSpanpruningSpansPruned(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualProcessorSpanpruningSpansReceived(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualProcessorSpanpruningTracesProcessed(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + + require.NoError(t, 
testTel.Shutdown(context.Background())) +} diff --git a/processor/spanpruningprocessor/metadata.yaml b/processor/spanpruningprocessor/metadata.yaml index c58f86526f4e2..2fddc5083d863 100644 --- a/processor/spanpruningprocessor/metadata.yaml +++ b/processor/spanpruningprocessor/metadata.yaml @@ -1,10 +1,69 @@ -display_name: Span Pruning Processor type: spanpruning status: class: processor stability: - development: [traces] - distributions: [] + alpha: [traces] + distributions: [contrib] codeowners: active: [portertech] + +telemetry: + metrics: + processor_spanpruning_aggregation_group_size: + enabled: true + description: Distribution of spans per aggregation group + unit: "{spans}" + stability: development + histogram: + value_type: int + + processor_spanpruning_aggregations_created: + enabled: true + description: Total aggregation summary spans created + unit: "{spans}" + stability: development + sum: + value_type: int + monotonic: true + + processor_spanpruning_processing_duration: + enabled: true + description: Time to process each batch of traces + unit: s + stability: development + histogram: + value_type: double + + processor_spanpruning_spans_pruned: + enabled: true + description: Total spans pruned/removed by aggregation + unit: "{spans}" + stability: development + sum: + value_type: int + monotonic: true + + processor_spanpruning_spans_received: + enabled: true + description: Total spans received by the processor + unit: "{spans}" + stability: development + sum: + value_type: int + monotonic: true + + processor_spanpruning_traces_processed: + enabled: true + description: Total traces processed + unit: "{traces}" + stability: development + sum: + value_type: int + monotonic: true + +tests: + config: + group_by_attributes: + - "db.operation" + min_spans_to_aggregate: 2 diff --git a/processor/spanpruningprocessor/processor.go b/processor/spanpruningprocessor/processor.go index 61d597f2c0076..dfbb16d68e7cd 100644 --- 
a/processor/spanpruningprocessor/processor.go +++ b/processor/spanpruningprocessor/processor.go @@ -5,16 +5,259 @@ package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry- import ( "context" + "fmt" + "time" + "github.com/gobwas/glob" + "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/ptrace" + "go.opentelemetry.io/collector/processor" + "go.uber.org/zap" + + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor/internal/metadata" ) -type spanPruningProcessor struct{} +// spanInfo pairs a span with its ScopeSpans container for in-place edits. +type spanInfo struct { + span ptrace.Span + scopeSpans ptrace.ScopeSpans +} + +// attributePattern caches a compiled glob used for attribute key matching. +type attributePattern struct { + glob glob.Glob +} + +// spanPruningProcessor aggregates similar leaf spans (and eligible parents) +// according to configuration while emitting telemetry about pruning actions. +type spanPruningProcessor struct { + config *Config + logger *zap.Logger + attributePatterns []attributePattern + telemetryBuilder *metadata.TelemetryBuilder +} + +func newSpanPruningProcessor(set processor.Settings, cfg *Config, telemetryBuilder *metadata.TelemetryBuilder) (*spanPruningProcessor, error) { + // Compile glob patterns for group_by_attributes + patterns := make([]attributePattern, 0, len(cfg.GroupByAttributes)) + for _, pattern := range cfg.GroupByAttributes { + g, err := glob.Compile(pattern) + if err != nil { + return nil, fmt.Errorf("invalid glob pattern %q: %w", pattern, err) + } + patterns = append(patterns, attributePattern{ + glob: g, + }) + } + + return &spanPruningProcessor{ + config: cfg, + logger: set.Logger, + attributePatterns: patterns, + telemetryBuilder: telemetryBuilder, + }, nil +} -func newSpanPruningProcessor() *spanPruningProcessor { - return &spanPruningProcessor{} +// shutdown releases processor resources, including telemetry providers. 
+func (p *spanPruningProcessor) shutdown(_ context.Context) error { + p.telemetryBuilder.Shutdown() + return nil } -func (*spanPruningProcessor) processTraces(_ context.Context, td ptrace.Traces) (ptrace.Traces, error) { +// processTraces runs aggregation for each trace batch and records processor +// telemetry about received, pruned, and aggregated spans. +func (p *spanPruningProcessor) processTraces(ctx context.Context, td ptrace.Traces) (ptrace.Traces, error) { + start := time.Now() + + // Count incoming spans + totalSpans := int64(0) + for i := 0; i < td.ResourceSpans().Len(); i++ { + for j := 0; j < td.ResourceSpans().At(i).ScopeSpans().Len(); j++ { + totalSpans += int64(td.ResourceSpans().At(i).ScopeSpans().At(j).Spans().Len()) + } + } + p.telemetryBuilder.ProcessorSpanpruningSpansReceived.Add(ctx, totalSpans) + + // Group spans by TraceID + traceSpans := p.groupSpansByTraceID(td) + + // Process each trace independently + tracesProcessed := int64(0) + for _, spans := range traceSpans { + p.processTrace(ctx, spans) + tracesProcessed++ + } + + // Record telemetry only when actual work was done + if tracesProcessed > 0 { + p.telemetryBuilder.ProcessorSpanpruningTracesProcessed.Add(ctx, tracesProcessed) + p.telemetryBuilder.ProcessorSpanpruningProcessingDuration.Record(ctx, + time.Since(start).Seconds()) + } + return td, nil } + +// groupSpansByTraceID flattens incoming data into a TraceID-indexed map so +// each trace can be analyzed independently. 
+func (*spanPruningProcessor) groupSpansByTraceID(td ptrace.Traces) map[pcommon.TraceID][]spanInfo { + traceSpans := make(map[pcommon.TraceID][]spanInfo) + + rss := td.ResourceSpans() + for i := 0; i < rss.Len(); i++ { + rs := rss.At(i) + ilss := rs.ScopeSpans() + for j := 0; j < ilss.Len(); j++ { + ils := ilss.At(j) + spans := ils.Spans() + for k := 0; k < spans.Len(); k++ { + span := spans.At(k) + traceID := span.TraceID() + traceSpans[traceID] = append(traceSpans[traceID], spanInfo{ + span: span, + scopeSpans: ils, + }) + } + } + } + + return traceSpans +} + +// processTrace applies the pruning algorithm to a single trace: +// 1) analyze aggregation candidates bottom-up, 2) build a top-down execution +// plan, and 3) create summary spans while removing originals. +func (p *spanPruningProcessor) processTrace(ctx context.Context, spans []spanInfo) { + // Build trace tree + tree := p.buildTraceTree(spans) + if len(tree.nodeByID) == 0 { + return + } + + // Phase 1: Analyze aggregations (bottom-up) + aggregationGroups := p.analyzeAggregationsWithTree(tree) + if len(aggregationGroups) == 0 { + return + } + + // Phase 2: Build aggregation plan (order top-down) + plan := p.buildAggregationPlan(aggregationGroups) + + // Phase 3: Execute aggregations (top-down) and record pruned spans + prunedCount := p.executeAggregations(plan) + + // Record telemetry after aggregation is complete + p.telemetryBuilder.ProcessorSpanpruningSpansPruned.Add(ctx, int64(prunedCount)) + p.telemetryBuilder.ProcessorSpanpruningAggregationsCreated.Add(ctx, int64(len(plan.groups))) + for i := range plan.groups { + p.telemetryBuilder.ProcessorSpanpruningAggregationGroupSize.Record(ctx, int64(len(plan.groups[i].nodes))) + } +} + +// analyzeAggregationsWithTree performs Phase 1 using tree structure +// Uses markedForRemoval field on nodes instead of separate map for better performance +// Optimized to walk up from marked nodes instead of scanning all nodes +func (p *spanPruningProcessor) 
analyzeAggregationsWithTree(tree *traceTree) map[string]aggregationGroup { + // Step 1: Get pre-computed leaf nodes + leafNodes := tree.getLeaves() + if len(leafNodes) == 0 { + return nil + } + + // Step 2: Group similar leaf nodes + leafGroups := p.groupLeafNodesByKey(leafNodes) + + // Step 3: Filter groups meeting minimum threshold and mark nodes + // Pre-size based on expected number of groups + aggregationGroups := make(map[string]aggregationGroup, len(leafGroups)/2) + + // Track nodes marked in this round for candidate collection + var markedNodes []*spanNode + + for groupKey, nodes := range leafGroups { + if len(nodes) < p.config.MinSpansToAggregate { + continue + } + + // Find template from nodes + templateNode := findLongestDurationNode(nodes) + + aggregationGroups[groupKey] = aggregationGroup{ + nodes: nodes, + depth: 0, + templateNode: templateNode, + } + + // Mark spans for removal + for _, node := range nodes { + node.markedForRemoval = true + } + markedNodes = append(markedNodes, nodes...) 
+ } + + if len(aggregationGroups) == 0 { + return nil + } + + // Step 4: Walk up the tree to find eligible parent spans recursively + // Respect MaxParentDepth: 0 = no parent aggregation, -1 = unlimited, >0 = limit + if p.config.MaxParentDepth == 0 { + return aggregationGroups + } + + // Collect initial parent candidates from marked leaf nodes + candidates := collectParentCandidates(markedNodes) + + depth := 1 + for len(candidates) > 0 { + // Check if we've reached the maximum parent depth limit + if p.config.MaxParentDepth > 0 && depth > p.config.MaxParentDepth { + break + } + + // Find eligible parents from candidates (walks up from marked nodes) + eligibleParents := p.findEligibleParentNodesFromCandidates(candidates) + if len(eligibleParents) == 0 { + break + } + + // Group parent candidates by name + status + parentGroups := make(map[string][]*spanNode) + for _, node := range eligibleParents { + parentKey := p.buildParentGroupKey(node.span) + parentGroups[parentKey] = append(parentGroups[parentKey], node) + } + + // Add parent groups (at least 2 parents to aggregate) + markedNodes = markedNodes[:0] // reset for this round + for parentKey, nodes := range parentGroups { + if len(nodes) < 2 { + continue + } + + // Find the template node (longest duration) for this group + templateNode := findLongestDurationNode(nodes) + + aggregationGroups[parentKey] = aggregationGroup{ + nodes: nodes, + depth: depth, + templateNode: templateNode, + } + // Mark parent nodes for removal + for _, node := range nodes { + node.markedForRemoval = true + } + markedNodes = append(markedNodes, nodes...) 
+ } + + if len(markedNodes) == 0 { + break + } + + // Collect next round of candidates from newly marked nodes + candidates = collectParentCandidates(markedNodes) + depth++ + } + + return aggregationGroups +} diff --git a/processor/spanpruningprocessor/processor_benchmark_test.go b/processor/spanpruningprocessor/processor_benchmark_test.go new file mode 100644 index 0000000000000..4395f927ce26c --- /dev/null +++ b/processor/spanpruningprocessor/processor_benchmark_test.go @@ -0,0 +1,271 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor + +import ( + "testing" + + "go.opentelemetry.io/collector/pdata/ptrace" + "go.opentelemetry.io/collector/processor/processortest" + + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor/internal/metadata" +) + +// BenchmarkProcessTrace_SmallTrace benchmarks processing a small trace (10 spans). +func BenchmarkProcessTrace_SmallTrace(b *testing.B) { + benchmarkProcessTrace(b, 10, 5) +} + +// BenchmarkProcessTrace_MediumTrace benchmarks processing a medium trace (100 spans). +func BenchmarkProcessTrace_MediumTrace(b *testing.B) { + benchmarkProcessTrace(b, 100, 20) +} + +// BenchmarkProcessTrace_LargeTrace benchmarks processing a large trace (1000 spans). +func BenchmarkProcessTrace_LargeTrace(b *testing.B) { + benchmarkProcessTrace(b, 1000, 50) +} + +// BenchmarkProcessTrace_SparseAggregation benchmarks sparse aggregation (~10% aggregate). +func BenchmarkProcessTrace_SparseAggregation(b *testing.B) { + benchmarkProcessTraceSparse(b, 1000, 5) +} + +// BenchmarkDeepTrace_Depth1 benchmarks deep trace with max_parent_depth=1. +func BenchmarkDeepTrace_Depth1(b *testing.B) { + benchmarkDeepTrace(b, 20, 3, 5, 1000, 1) +} + +// BenchmarkDeepTrace_Depth5 benchmarks deep trace with max_parent_depth=5. 
+func BenchmarkDeepTrace_Depth5(b *testing.B) { + benchmarkDeepTrace(b, 20, 3, 5, 1000, 5) +} + +// BenchmarkDeepTrace_Depth10 benchmarks deep trace with max_parent_depth=10. +func BenchmarkDeepTrace_Depth10(b *testing.B) { + benchmarkDeepTrace(b, 20, 3, 5, 1000, 10) +} + +// BenchmarkBuildTraceTree benchmarks tree construction. +func BenchmarkBuildTraceTree(b *testing.B) { + proc := newBenchmarkProcessor(b, 5) + spans := generateTestSpans(1000, 50) + + b.ResetTimer() + for b.Loop() { + _ = proc.buildTraceTree(spans) + } +} + +// BenchmarkGroupLeafNodes benchmarks leaf node grouping. +func BenchmarkGroupLeafNodes(b *testing.B) { + proc := newBenchmarkProcessor(b, 5) + spans := generateTestSpans(1000, 50) + tree := proc.buildTraceTree(spans) + leaves := tree.getLeaves() + + b.ResetTimer() + for b.Loop() { + for _, leaf := range leaves { + leaf.groupKey = "" + } + _ = proc.groupLeafNodesByKey(leaves) + } +} + +// BenchmarkFindEligibleParents benchmarks parent candidate discovery. +func BenchmarkFindEligibleParents(b *testing.B) { + proc := newBenchmarkProcessor(b, 5) + spans := generateTestSpans(1000, 50) + tree := proc.buildTraceTree(spans) + leaves := tree.getLeaves() + + for _, leaf := range leaves { + leaf.markedForRemoval = true + } + candidates := collectParentCandidates(leaves) + + b.ResetTimer() + for b.Loop() { + for _, c := range candidates { + c.markedForRemoval = false + } + _ = proc.findEligibleParentNodesFromCandidates(candidates) + } +} + +// BenchmarkBuildGroupKey benchmarks group key construction. +func BenchmarkBuildGroupKey(b *testing.B) { + proc := newBenchmarkProcessor(b, 1) + td := generateTestTrace(200, 5) + span := td.ResourceSpans().At(0).ScopeSpans().At(0).Spans().At(2) + + b.ResetTimer() + for b.Loop() { + _ = proc.buildGroupKey(span) + } +} + +// BenchmarkExecuteAggregations benchmarks the aggregation execution phase. 
+func BenchmarkExecuteAggregations(b *testing.B) { + proc := newBenchmarkProcessor(b, 1) + base := generateTestTrace(500, 5) + + b.ResetTimer() + for range b.N { + td := ptrace.NewTraces() + base.CopyTo(td) + + spans := spanInfosFromTraces(td) + tree := proc.buildTraceTree(spans) + leafGroups := proc.groupLeafNodesByKey(tree.getLeaves()) + + groups := make(map[string]aggregationGroup, len(leafGroups)) + for key, nodes := range leafGroups { + if len(nodes) >= proc.config.MinSpansToAggregate { + templateNode := findLongestDurationNode(nodes) + groups[key] = aggregationGroup{nodes: nodes, depth: 0, templateNode: templateNode} + } + } + + plan := proc.buildAggregationPlan(groups) + + b.StartTimer() + proc.executeAggregations(plan) + b.StopTimer() + } +} + +// newBenchmarkProcessor creates a processor configured for benchmarking. +func newBenchmarkProcessor(b *testing.B, maxParentDepth int) *spanPruningProcessor { + b.Helper() + + cfg := createDefaultConfig().(*Config) + cfg.GroupByAttributes = []string{"http.*", "db.*"} + cfg.MinSpansToAggregate = 5 + cfg.MaxParentDepth = maxParentDepth + + set := processortest.NewNopSettings(metadata.Type) + telemetryBuilder, err := metadata.NewTelemetryBuilder(set.TelemetrySettings) + if err != nil { + b.Fatal(err) + } + + proc, err := newSpanPruningProcessor(set, cfg, telemetryBuilder) + if err != nil { + b.Fatal(err) + } + return proc +} + +func spanInfosFromTraces(td ptrace.Traces) []spanInfo { + var spans []spanInfo + + rss := td.ResourceSpans() + for i := 0; i < rss.Len(); i++ { + ilss := rss.At(i).ScopeSpans() + for j := 0; j < ilss.Len(); j++ { + ss := ilss.At(j) + ssSpans := ss.Spans() + for k := 0; k < ssSpans.Len(); k++ { + spans = append(spans, spanInfo{ + span: ssSpans.At(k), + scopeSpans: ss, + }) + } + } + } + + return spans +} + +func benchmarkProcessTrace(b *testing.B, numSpans, minSpans int) { + cfg := createDefaultConfig().(*Config) + cfg.MinSpansToAggregate = minSpans + cfg.GroupByAttributes = []string{"http.*"} + 
+ set := processortest.NewNopSettings(metadata.Type) + telemetryBuilder, err := metadata.NewTelemetryBuilder(set.TelemetrySettings) + if err != nil { + b.Fatal(err) + } + + proc, err := newSpanPruningProcessor(set, cfg, telemetryBuilder) + if err != nil { + b.Fatal(err) + } + + td := generateTestTrace(numSpans, minSpans) + + b.ResetTimer() + for b.Loop() { + cloned := ptrace.NewTraces() + td.CopyTo(cloned) + _, err := proc.processTraces(b.Context(), cloned) + if err != nil { + b.Fatal(err) + } + } +} + +func benchmarkProcessTraceSparse(b *testing.B, numSpans, minSpans int) { + cfg := createDefaultConfig().(*Config) + cfg.MinSpansToAggregate = minSpans + cfg.GroupByAttributes = []string{"db.*"} + cfg.MaxParentDepth = 3 + + set := processortest.NewNopSettings(metadata.Type) + telemetryBuilder, err := metadata.NewTelemetryBuilder(set.TelemetrySettings) + if err != nil { + b.Fatal(err) + } + + proc, err := newSpanPruningProcessor(set, cfg, telemetryBuilder) + if err != nil { + b.Fatal(err) + } + + td := generateSparseTrace(numSpans, minSpans) + + b.ResetTimer() + for b.Loop() { + cloned := ptrace.NewTraces() + td.CopyTo(cloned) + _, err := proc.processTraces(b.Context(), cloned) + if err != nil { + b.Fatal(err) + } + } +} + +func benchmarkDeepTrace(b *testing.B, depth, branchingFactor, leafsPerBranch, maxSpans, maxParentDepth int) { + cfg := createDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + cfg.GroupByAttributes = []string{"db.*"} + cfg.MaxParentDepth = maxParentDepth + + set := processortest.NewNopSettings(metadata.Type) + telemetryBuilder, err := metadata.NewTelemetryBuilder(set.TelemetrySettings) + if err != nil { + b.Fatal(err) + } + + proc, err := newSpanPruningProcessor(set, cfg, telemetryBuilder) + if err != nil { + b.Fatal(err) + } + + td := generateDeepTrace(depth, branchingFactor, leafsPerBranch, maxSpans) + b.ReportMetric(float64(td.ResourceSpans().At(0).ScopeSpans().At(0).Spans().Len()), "spans") + + b.ResetTimer() + for b.Loop() { + cloned := 
ptrace.NewTraces() + td.CopyTo(cloned) + _, err := proc.processTraces(b.Context(), cloned) + if err != nil { + b.Fatal(err) + } + } +} diff --git a/processor/spanpruningprocessor/processor_test.go b/processor/spanpruningprocessor/processor_test.go new file mode 100644 index 0000000000000..965cba150b483 --- /dev/null +++ b/processor/spanpruningprocessor/processor_test.go @@ -0,0 +1,1791 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/consumer/consumertest" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/ptrace" + "go.opentelemetry.io/collector/processor/processortest" + + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor/internal/metadata" +) + +func TestNewTraces(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + require.NotNil(t, tp) +} + +func TestLeafSpanPruning_BasicAggregation(t *testing.T) { + // Test: 3 identical leaf spans should be aggregated into 1 summary span + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithLeafSpans(t, 3, "SELECT", map[string]string{"db.operation": "select"}) + originalSpanCount := countSpans(td) + assert.Equal(t, 4, originalSpanCount) // 1 parent + 3 leaf spans + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // After processing: should have 1 parent + 1 summary span + finalSpanCount := countSpans(td) + 
assert.Equal(t, 2, finalSpanCount) + + // Verify summary span exists with aggregation attributes + summarySpan, found := findSummarySpan(td) + require.True(t, found, "summary span should exist") + + // Check aggregation attributes + attrs := summarySpan.Attributes() + spanCount, exists := attrs.Get("aggregation.span_count") + assert.True(t, exists, "aggregation.span_count should exist") + assert.Equal(t, int64(3), spanCount.Int()) +} + +func TestLeafSpanPruning_BelowThreshold(t *testing.T) { + // Test: 1 leaf span with min_spans_to_aggregate=2 should not be aggregated + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithLeafSpans(t, 1, "SELECT", map[string]string{"db.operation": "select"}) + originalSpanCount := countSpans(td) + assert.Equal(t, 2, originalSpanCount) // 1 parent + 1 leaf span + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // Should remain unchanged + finalSpanCount := countSpans(td) + assert.Equal(t, 2, finalSpanCount) +} + +func TestLeafSpanPruning_MixedLeafAndNonLeaf(t *testing.T) { + // Test: only aggregate leaf spans, not spans with children + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + // Create trace: root -> intermediate -> 3 leaf spans + td := createTestTraceWithIntermediateSpan(t) + originalSpanCount := countSpans(td) + assert.Equal(t, 5, originalSpanCount) // 1 root + 1 intermediate + 3 leaf spans + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // After: 1 root + 1 intermediate + 1 summary + finalSpanCount := countSpans(td) + assert.Equal(t, 3, 
finalSpanCount) +} + +func TestLeafSpanPruning_DifferentGroups(t *testing.T) { + // Test: spans with different attributes should stay in separate groups + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + cfg.GroupByAttributes = []string{"db.operation"} + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + // Create trace with mixed operations: 3 SELECT + 2 INSERT + td := createTestTraceWithMixedOperations(t) + originalSpanCount := countSpans(td) + assert.Equal(t, 6, originalSpanCount) // 1 parent + 3 SELECT + 2 INSERT + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // After: 1 parent + 1 SELECT summary + 1 INSERT summary + finalSpanCount := countSpans(td) + assert.Equal(t, 3, finalSpanCount) +} + +func TestLeafSpanPruning_EmptyTrace(t *testing.T) { + // Test: empty trace should be handled gracefully + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := ptrace.NewTraces() + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + assert.Equal(t, 0, countSpans(td)) +} + +func TestLeafSpanPruning_SingleSpanTrace(t *testing.T) { + // Test: single span trace (root only) should not be modified + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createSingleSpanTrace(t) + originalSpanCount := countSpans(td) + assert.Equal(t, 1, originalSpanCount) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // Should remain unchanged + finalSpanCount := countSpans(td) + assert.Equal(t, 1, 
finalSpanCount) +} + +func TestLeafSpanPruning_StatusAggregation(t *testing.T) { + // Test: spans with different status codes should be in separate groups + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + // Create trace with 4 OK spans and 2 Error spans (same name) + td := createTestTraceWithMixedStatusSpans(t) + originalSpanCount := countSpans(td) + assert.Equal(t, 7, originalSpanCount) // 1 parent + 4 OK + 2 Error + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // After: 1 parent + 1 OK summary (4 spans) + 1 Error summary (2 spans) + finalSpanCount := countSpans(td) + assert.Equal(t, 3, finalSpanCount) + + // Verify we have both an OK summary and an Error summary + okSummary, found := findSpanByNameAndStatus(td, "SELECT", ptrace.StatusCodeOk) + require.True(t, found) + okCount, _ := okSummary.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(4), okCount.Int()) + + errorSummary, found := findSpanByNameAndStatus(td, "SELECT", ptrace.StatusCodeError) + require.True(t, found) + errorCount, _ := errorSummary.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(2), errorCount.Int()) +} + +func TestLeafSpanPruning_StatusBelowThreshold(t *testing.T) { + // Test: 1 OK span + 1 Error span should not aggregate (each group below threshold) + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithErrorSpan(t) // Creates 1 OK + 1 Error span + originalSpanCount := countSpans(td) + assert.Equal(t, 3, originalSpanCount) // 1 parent + 1 OK + 1 Error + + err = tp.ConsumeTraces(t.Context(), td) + 
require.NoError(t, err) + + // Should remain unchanged - neither group meets threshold + finalSpanCount := countSpans(td) + assert.Equal(t, 3, finalSpanCount) +} + +func TestLeafSpanPruning_DurationStats(t *testing.T) { + // Test: verify duration statistics are calculated correctly + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + // Create spans with known durations: 100ns, 200ns, 300ns + td := createTestTraceWithKnownDurations(t, []int64{100, 200, 300}) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + summarySpan, found := findSummarySpan(td) + require.True(t, found) + + attrs := summarySpan.Attributes() + + minDuration, _ := attrs.Get("aggregation.duration_min_ns") + assert.Equal(t, int64(100), minDuration.Int()) + + maxDuration, _ := attrs.Get("aggregation.duration_max_ns") + assert.Equal(t, int64(300), maxDuration.Int()) + + avgDuration, _ := attrs.Get("aggregation.duration_avg_ns") + assert.Equal(t, int64(200), avgDuration.Int()) // (100+200+300)/3 = 200 + + totalDuration, _ := attrs.Get("aggregation.duration_total_ns") + assert.Equal(t, int64(600), totalDuration.Int()) +} + +func TestLeafSpanPruning_GroupByNonStringAttributes(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + cfg.GroupByAttributes = []string{"db.retries", "db.cached"} + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithNonStringAttributes(t) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + finalSpanCount := countSpans(td) + assert.Equal(t, 3, finalSpanCount) + + summaries := findAllSummarySpans(td) + assert.Len(t, summaries, 2) + + var 
foundRetriesOne bool + var foundRetriesTwo bool + for _, summary := range summaries { + retries, retriesExists := summary.Attributes().Get("db.retries") + require.True(t, retriesExists) + cached, cachedExists := summary.Attributes().Get("db.cached") + require.True(t, cachedExists) + count, _ := summary.Attributes().Get("aggregation.span_count") + if retries.Int() == 1 && cached.Bool() { + foundRetriesOne = true + assert.Equal(t, int64(2), count.Int()) + } + if retries.Int() == 2 && cached.Bool() { + foundRetriesTwo = true + assert.Equal(t, int64(2), count.Int()) + } + } + + assert.True(t, foundRetriesOne) + assert.True(t, foundRetriesTwo) +} + +func TestLeafSpanPruning_TemplateEventsAndLinksPreserved(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithTemplateEventsAndLinks(t) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + summarySpan, found := findSummarySpan(td) + require.True(t, found) + + events := summarySpan.Events() + require.Equal(t, 1, events.Len()) + assert.Equal(t, "template_event", events.At(0).Name()) + eventAttr, eventAttrExists := events.At(0).Attributes().Get("event.attr") + require.True(t, eventAttrExists) + assert.Equal(t, "value", eventAttr.Str()) + + links := summarySpan.Links() + require.Equal(t, 1, links.Len()) + linkAttr, linkAttrExists := links.At(0).Attributes().Get("link.kind") + require.True(t, linkAttrExists) + assert.Equal(t, "template", linkAttr.Str()) +} + +// Helper functions + +func createTestTraceWithLeafSpans(t *testing.T, numLeafSpans int, spanName string, attrs map[string]string) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + // Create parent span + parentSpan := ss.Spans().AppendEmpty() + parentSpan.SetTraceID(traceID) + parentSpan.SetSpanID(parentSpanID) + parentSpan.SetName("parent") + + // Create leaf spans + for i := range numLeafSpans { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName(spanName) + span.SetStartTimestamp(pcommon.Timestamp(1000000000 + int64(i)*100)) + span.SetEndTimestamp(pcommon.Timestamp(1000000100 + int64(i)*100)) + for k, v := range attrs { + span.Attributes().PutStr(k, v) + } + } + + return td +} + +func createTestTraceWithNonStringAttributes(t *testing.T) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + parentSpan := ss.Spans().AppendEmpty() + parentSpan.SetTraceID(traceID) + parentSpan.SetSpanID(parentSpanID) + parentSpan.SetName("parent") + + spanConfigs := []struct { + spanID pcommon.SpanID + retries int64 + cached bool + }{ + {pcommon.SpanID([8]byte{2, 0, 0, 0, 0, 0, 0, 0}), 1, true}, + {pcommon.SpanID([8]byte{2, 1, 0, 0, 0, 0, 0, 0}), 1, true}, + {pcommon.SpanID([8]byte{3, 0, 0, 0, 0, 0, 0, 0}), 2, true}, + {pcommon.SpanID([8]byte{3, 1, 0, 0, 0, 0, 0, 0}), 2, true}, + } + + for i, cfg := range spanConfigs { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(cfg.spanID) + span.SetParentSpanID(parentSpanID) + span.SetName("db_query") + span.Attributes().PutInt("db.retries", cfg.retries) + span.Attributes().PutBool("db.cached", cfg.cached) + span.SetStartTimestamp(pcommon.Timestamp(1000000000 + int64(i)*100)) + 
span.SetEndTimestamp(pcommon.Timestamp(1000000100 + int64(i)*100))
	}

	return td
}

// createTestTraceWithTemplateEventsAndLinks builds a trace with one parent and
// two "SELECT" children that share db.operation=select. The first child lasts
// 500ns and carries one event and one link; the second lasts 100ns and has
// neither.
func createTestTraceWithTemplateEventsAndLinks(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	spans := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty().Spans()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	parent := spans.AppendEmpty()
	parent.SetTraceID(traceID)
	parent.SetSpanID(parentID)
	parent.SetName("parent")

	// Longest child: the one decorated with an event and a link.
	long := spans.AppendEmpty()
	long.SetTraceID(traceID)
	long.SetSpanID(pcommon.SpanID([8]byte{2, 0, 0, 0, 0, 0, 0, 0}))
	long.SetParentSpanID(parentID)
	long.SetName("SELECT")
	long.SetStartTimestamp(pcommon.Timestamp(1000000000))
	long.SetEndTimestamp(pcommon.Timestamp(1000000500))
	long.Attributes().PutStr("db.operation", "select")

	event := long.Events().AppendEmpty()
	event.SetName("template_event")
	event.Attributes().PutStr("event.attr", "value")

	link := long.Links().AppendEmpty()
	link.SetTraceID(pcommon.TraceID([16]byte{9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}))
	link.SetSpanID(pcommon.SpanID([8]byte{9, 9, 9, 9, 9, 9, 9, 9}))
	link.Attributes().PutStr("link.kind", "template")

	// Shorter child with no events or links.
	short := spans.AppendEmpty()
	short.SetTraceID(traceID)
	short.SetSpanID(pcommon.SpanID([8]byte{2, 1, 0, 0, 0, 0, 0, 0}))
	short.SetParentSpanID(parentID)
	short.SetName("SELECT")
	short.SetStartTimestamp(pcommon.Timestamp(1000000000))
	short.SetEndTimestamp(pcommon.Timestamp(1000000100))
	short.Attributes().PutStr("db.operation", "select")

	return td
}

// createTestTraceWithIntermediateSpan builds a three-level trace: a "root"
// span, an "intermediate" child of the root, and three "SELECT" leaves that
// are children of the intermediate span.
func createTestTraceWithIntermediateSpan(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	spans := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty().Spans()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	rootID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})
	middleID := pcommon.SpanID([8]byte{2, 0, 0, 0, 0, 0, 0, 0})

	root := spans.AppendEmpty()
	root.SetTraceID(traceID)
	root.SetSpanID(rootID)
	root.SetName("root")

	// Sits between the root and the leaves.
	middle := spans.AppendEmpty()
	middle.SetTraceID(traceID)
	middle.SetSpanID(middleID)
	middle.SetParentSpanID(rootID)
	middle.SetName("intermediate")

	for i := range 3 {
		leaf := spans.AppendEmpty()
		leaf.SetTraceID(traceID)
		leaf.SetSpanID(pcommon.SpanID([8]byte{3, byte(i), 0, 0, 0, 0, 0, 0}))
		leaf.SetParentSpanID(middleID)
		leaf.SetName("SELECT")
		leaf.SetStartTimestamp(pcommon.Timestamp(1000000000))
		leaf.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	return td
}

func createTestTraceWithMixedOperations(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	rs := td.ResourceSpans().AppendEmpty()
	ss := rs.ScopeSpans().AppendEmpty()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	// Parent span
	parentSpan := ss.Spans().AppendEmpty()
	parentSpan.SetTraceID(traceID)
	parentSpan.SetSpanID(parentSpanID)
	parentSpan.SetName("parent")

	// 3 SELECT spans
	for i := range 3 {
		span := ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0}))
		span.SetParentSpanID(parentSpanID)
span.SetName("db_query")
		span.Attributes().PutStr("db.operation", "select")
		span.SetStartTimestamp(pcommon.Timestamp(1000000000))
		span.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	// 2 INSERT spans
	for i := range 2 {
		span := ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		span.SetSpanID(pcommon.SpanID([8]byte{3, byte(i), 0, 0, 0, 0, 0, 0}))
		span.SetParentSpanID(parentSpanID)
		span.SetName("db_query")
		span.Attributes().PutStr("db.operation", "insert")
		span.SetStartTimestamp(pcommon.Timestamp(1000000000))
		span.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	return td
}

// createSingleSpanTrace builds a trace whose only span is named "root" and
// has no parent span ID set.
func createSingleSpanTrace(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()

	root := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty().Spans().AppendEmpty()
	root.SetTraceID(pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}))
	root.SetSpanID(pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}))
	root.SetName("root")

	return td
}

func createTestTraceWithErrorSpan(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	rs := td.ResourceSpans().AppendEmpty()
	ss := rs.ScopeSpans().AppendEmpty()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	// Parent span
	parentSpan := ss.Spans().AppendEmpty()
	parentSpan.SetTraceID(traceID)
	parentSpan.SetSpanID(parentSpanID)
	parentSpan.SetName("parent")

	// Leaf span with OK status
	span1 := ss.Spans().AppendEmpty()
	span1.SetTraceID(traceID)
	span1.SetSpanID(pcommon.SpanID([8]byte{2, 0, 0, 0, 0, 0, 0, 0}))
	span1.SetParentSpanID(parentSpanID)
	span1.SetName("SELECT")
	span1.Status().SetCode(ptrace.StatusCodeOk)
	span1.SetStartTimestamp(pcommon.Timestamp(1000000000))
	span1.SetEndTimestamp(pcommon.Timestamp(1000000100))

	// Leaf span with Error status
	span2 :=
ss.Spans().AppendEmpty()
	span2.SetTraceID(traceID)
	span2.SetSpanID(pcommon.SpanID([8]byte{2, 1, 0, 0, 0, 0, 0, 0}))
	span2.SetParentSpanID(parentSpanID)
	span2.SetName("SELECT")
	span2.Status().SetCode(ptrace.StatusCodeError)
	span2.Status().SetMessage("query failed")
	span2.SetStartTimestamp(pcommon.Timestamp(1000000000))
	span2.SetEndTimestamp(pcommon.Timestamp(1000000100))

	return td
}

// createTestTraceWithKnownDurations builds a trace with one "parent" span and
// one "SELECT" child per entry of durationsNs. Every child starts at the same
// timestamp and ends durationsNs[i] nanoseconds later.
func createTestTraceWithKnownDurations(t *testing.T, durationsNs []int64) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	rs := td.ResourceSpans().AppendEmpty()
	ss := rs.ScopeSpans().AppendEmpty()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	// Parent span
	parentSpan := ss.Spans().AppendEmpty()
	parentSpan.SetTraceID(traceID)
	parentSpan.SetSpanID(parentSpanID)
	parentSpan.SetName("parent")

	// Leaf spans with specific durations
	baseTime := int64(1000000000)
	for i, duration := range durationsNs {
		span := ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		// Spread the index over four bytes so span IDs stay unique for more
		// than 256 durations; for i < 256 the ID is unchanged.
		span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), byte(i >> 8), byte(i >> 16), byte(i >> 24), 0, 0, 0}))
		span.SetParentSpanID(parentSpanID)
		span.SetName("SELECT")
		span.SetStartTimestamp(pcommon.Timestamp(baseTime))
		span.SetEndTimestamp(pcommon.Timestamp(baseTime + duration))
	}

	return td
}

// countSpans returns the total number of spans in td across every resource
// and scope.
func countSpans(td ptrace.Traces) int {
	// ptrace.Traces already provides this aggregate; no need to walk by hand.
	return td.SpanCount()
}

// findSummarySpan returns the first span, in resource/scope/span order, whose
// "aggregation.is_summary" attribute is present and true. The boolean result
// is false when no such span exists.
func findSummarySpan(td ptrace.Traces) (ptrace.Span, bool) {
	rss := td.ResourceSpans()
	for i := range rss.Len() {
		ilss := rss.At(i).ScopeSpans()
		for j := range ilss.Len() {
			spans := ilss.At(j).Spans()
			for k := range spans.Len() {
				span := spans.At(k)
				isSummary, exists := span.Attributes().Get("aggregation.is_summary")
				if exists && isSummary.Bool() {
					return span, true
				}
			}
		}
	}
	return ptrace.Span{}, false
}

// findSpanByNameAndStatus returns the first summary span (one whose
// "aggregation.is_summary" attribute is present and true) that also has
// exactly the given name and status code. The boolean result is false when no
// such span exists.
func findSpanByNameAndStatus(td ptrace.Traces, spanName string, statusCode ptrace.StatusCode) (ptrace.Span, bool) {
	rss := td.ResourceSpans()
	for i := range rss.Len() {
		ilss := rss.At(i).ScopeSpans()
		for j := range ilss.Len() {
			spans := ilss.At(j).Spans()
			for k := range spans.Len() {
				span := spans.At(k)
				isSummary, exists := span.Attributes().Get("aggregation.is_summary")
				if exists && isSummary.Bool() && span.Name() == spanName && span.Status().Code() == statusCode {
					return span, true
				}
			}
		}
	}
	return ptrace.Span{}, false
}

func createTestTraceWithMixedStatusSpans(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	rs := td.ResourceSpans().AppendEmpty()
	ss := rs.ScopeSpans().AppendEmpty()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	// Parent span
	parentSpan := ss.Spans().AppendEmpty()
	parentSpan.SetTraceID(traceID)
	parentSpan.SetSpanID(parentSpanID)
	parentSpan.SetName("parent")

	// 4 leaf spans with OK status
	for i := range 4 {
		span := ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0}))
		span.SetParentSpanID(parentSpanID)
		span.SetName("SELECT")
		span.Status().SetCode(ptrace.StatusCodeOk)
		span.SetStartTimestamp(pcommon.Timestamp(1000000000))
		span.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	// 2 leaf spans with Error status
	for i := range 2 {
		span := ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		span.SetSpanID(pcommon.SpanID([8]byte{3, byte(i), 0, 0, 0, 0, 0, 0}))
span.SetParentSpanID(parentSpanID)
		span.SetName("SELECT")
		span.Status().SetCode(ptrace.StatusCodeError)
		span.Status().SetMessage("query failed")
		span.SetStartTimestamp(pcommon.Timestamp(1000000000))
		span.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	return td
}

// Glob pattern matching tests

// TestLeafSpanPruning_GlobPatternWildcard checks that grouping by the glob
// "db.*" merges leaves whose db.operation, db.name and db.system attributes
// are all identical into a single summary span.
func TestLeafSpanPruning_GlobPatternWildcard(t *testing.T) {
	factory := NewFactory()
	cfg := factory.CreateDefaultConfig().(*Config)
	cfg.MinSpansToAggregate = 2
	cfg.GroupByAttributes = []string{"db.*"}

	proc, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop())
	require.NoError(t, err)

	// Input: 1 parent + 3 leaf spans with several db.* attributes each.
	traces := createTestTraceWithMultipleDbAttrs(t)
	assert.Equal(t, 4, countSpans(traces))

	require.NoError(t, proc.ConsumeTraces(t.Context(), traces))

	// Output: 1 parent + 1 summary, since all leaves share their db.* values.
	assert.Equal(t, 2, countSpans(traces))

	summary, found := findSummarySpan(traces)
	require.True(t, found)

	merged, _ := summary.Attributes().Get("aggregation.span_count")
	assert.Equal(t, int64(3), merged.Int())
}

func TestLeafSpanPruning_GlobPatternSeparatesGroups(t *testing.T) {
	// Test: spans with different db.* values should be in separate groups
	factory := NewFactory()
	cfg := factory.CreateDefaultConfig().(*Config)
	cfg.MinSpansToAggregate = 2
	cfg.GroupByAttributes = []string{"db.*"}

	tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop())
	require.NoError(t, err)

	// Create trace with spans having different db.operation values
	td := createTestTraceWithDifferentDbOperations(t)
	originalSpanCount :=
countSpans(td)
	assert.Equal(t, 5, originalSpanCount) // 1 parent + 2 select + 2 insert

	err = tp.ConsumeTraces(t.Context(), td)
	require.NoError(t, err)

	// 2 select spans -> 1 summary, 2 insert spans -> 1 summary
	finalSpanCount := countSpans(td)
	assert.Equal(t, 3, finalSpanCount) // 1 parent + 2 summaries
}

// TestLeafSpanPruning_GlobPatternMultiplePatterns checks that grouping by the
// two globs "db.*" and "http.*" still merges leaves whose matching attributes
// are all identical.
func TestLeafSpanPruning_GlobPatternMultiplePatterns(t *testing.T) {
	factory := NewFactory()
	cfg := factory.CreateDefaultConfig().(*Config)
	cfg.MinSpansToAggregate = 2
	cfg.GroupByAttributes = []string{"db.*", "http.*"}

	tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop())
	require.NoError(t, err)

	td := createTestTraceWithDbAndHTTPAttrs(t)
	originalSpanCount := countSpans(td)
	assert.Equal(t, 4, originalSpanCount) // 1 parent + 3 leaf spans

	err = tp.ConsumeTraces(t.Context(), td)
	require.NoError(t, err)

	// All spans have same db.* and http.* values, should aggregate
	finalSpanCount := countSpans(td)
	assert.Equal(t, 2, finalSpanCount)
}

// TestLeafSpanPruning_GlobPatternExactMatch checks that a pattern without a
// wildcard ("db.operation") is accepted as a grouping key and that the three
// identical leaves are merged into one summary.
func TestLeafSpanPruning_GlobPatternExactMatch(t *testing.T) {
	factory := NewFactory()
	cfg := factory.CreateDefaultConfig().(*Config)
	cfg.MinSpansToAggregate = 2
	cfg.GroupByAttributes = []string{"db.operation"}

	tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop())
	require.NoError(t, err)

	td := createTestTraceWithMultipleDbAttrs(t)

	err = tp.ConsumeTraces(t.Context(), td)
	require.NoError(t, err)

	// Should still group by db.operation exactly
	finalSpanCount := countSpans(td)
	assert.Equal(t, 2, finalSpanCount)
}

// TestLeafSpanPruning_InvalidGlobPattern checks that creating the processor
// with a malformed glob in GroupByAttributes fails with an error mentioning
// "invalid glob pattern".
func TestLeafSpanPruning_InvalidGlobPattern(t *testing.T) {
	factory := NewFactory()
	cfg := factory.CreateDefaultConfig().(*Config)
	cfg.GroupByAttributes = []string{"[invalid"}

	_, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop())
	// A single failing assertion covers both "no error" and "wrong message";
	// calling err.Error() after a non-fatal assert.Error would panic on nil.
	require.ErrorContains(t, err, "invalid glob pattern")
}

// Helper functions for glob pattern tests

// createTestTraceWithMultipleDbAttrs builds a trace with one "parent" span
// and three "db_query" children that all carry the same db.operation, db.name
// and db.system string attributes.
func createTestTraceWithMultipleDbAttrs(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	rs := td.ResourceSpans().AppendEmpty()
	ss := rs.ScopeSpans().AppendEmpty()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	// Parent span
	parentSpan := ss.Spans().AppendEmpty()
	parentSpan.SetTraceID(traceID)
	parentSpan.SetSpanID(parentSpanID)
	parentSpan.SetName("parent")

	// 3 leaf spans with identical db.* attributes
	for i := range 3 {
		span := ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0}))
		span.SetParentSpanID(parentSpanID)
		span.SetName("db_query")
		span.Attributes().PutStr("db.operation", "select")
		span.Attributes().PutStr("db.name", "users")
		span.Attributes().PutStr("db.system", "postgresql")
		span.SetStartTimestamp(pcommon.Timestamp(1000000000))
		span.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	return td
}

func createTestTraceWithDifferentDbOperations(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	rs := td.ResourceSpans().AppendEmpty()
	ss := rs.ScopeSpans().AppendEmpty()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	// Parent span
	parentSpan := ss.Spans().AppendEmpty()
	parentSpan.SetTraceID(traceID)
	parentSpan.SetSpanID(parentSpanID)
	parentSpan.SetName("parent")

	// 2 SELECT spans
	for i := range 2 {
		span :=
ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0}))
		span.SetParentSpanID(parentSpanID)
		span.SetName("db_query")
		span.Attributes().PutStr("db.operation", "select")
		span.Attributes().PutStr("db.name", "users")
		span.SetStartTimestamp(pcommon.Timestamp(1000000000))
		span.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	// 2 INSERT spans
	for i := range 2 {
		span := ss.Spans().AppendEmpty()
		span.SetTraceID(traceID)
		span.SetSpanID(pcommon.SpanID([8]byte{3, byte(i), 0, 0, 0, 0, 0, 0}))
		span.SetParentSpanID(parentSpanID)
		span.SetName("db_query")
		span.Attributes().PutStr("db.operation", "insert")
		span.Attributes().PutStr("db.name", "users")
		span.SetStartTimestamp(pcommon.Timestamp(1000000000))
		span.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	return td
}

// createTestTraceWithDbAndHTTPAttrs builds a trace with one "parent" span and
// three "api_call" children. Every child carries the same two db.* and two
// http.* string attributes and lasts 100ns.
func createTestTraceWithDbAndHTTPAttrs(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	spans := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty().Spans()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	parentID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	parent := spans.AppendEmpty()
	parent.SetTraceID(traceID)
	parent.SetSpanID(parentID)
	parent.SetName("parent")

	for i := range 3 {
		leaf := spans.AppendEmpty()
		leaf.SetTraceID(traceID)
		leaf.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0}))
		leaf.SetParentSpanID(parentID)
		leaf.SetName("api_call")

		attrs := leaf.Attributes()
		attrs.PutStr("db.operation", "select")
		attrs.PutStr("db.name", "users")
		attrs.PutStr("http.method", "GET")
		attrs.PutStr("http.route", "/api/users")

		leaf.SetStartTimestamp(pcommon.Timestamp(1000000000))
		leaf.SetEndTimestamp(pcommon.Timestamp(1000000100))
	}

	return td
}
+ +// TestLeafSpanPruning_RecursiveParentAggregation tests that parent spans are aggregated +// when all their children are aggregated +func TestLeafSpanPruning_RecursiveParentAggregation(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + cfg.GroupByAttributes = []string{"db.op"} + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + // Create the complex trace from the plan example + td := createTestTraceWithRecursiveAggregation(t) + + // Before: 1 root + 3 OK handlers + 3 OK SELECTs + 2 Error handlers + 2 Error SELECTs + 1 OK handler + 1 INSERT + 1 worker + 1 SELECT = 15 spans + originalSpanCount := countSpans(td) + assert.Equal(t, 15, originalSpanCount) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // After: 1 root + 1 OK handler_aggregated + 1 OK SELECT_aggregated + 1 Error handler_aggregated + 1 Error SELECT_aggregated + 1 OK handler + 1 INSERT + 1 worker + 1 SELECT = 9 spans + finalSpanCount := countSpans(td) + assert.Equal(t, 9, finalSpanCount) + + // Verify aggregated spans exist + handlerOKAgg, found := findSpanByName(td, "handler", "Ok") + require.True(t, found, "OK handler summary should exist") + + handlerErrorAgg, found := findSpanByName(td, "handler", "Error") + require.True(t, found, "Error handler summary should exist") + + selectOKAgg, found := findSpanByName(td, "SELECT", "Ok") + require.True(t, found, "OK SELECT summary should exist") + + selectErrorAgg, found := findSpanByName(td, "SELECT", "Error") + require.True(t, found, "Error SELECT summary should exist") + + // Verify span counts + handlerOKCount, _ := handlerOKAgg.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(3), handlerOKCount.Int()) + + handlerErrorCount, _ := handlerErrorAgg.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(2), handlerErrorCount.Int()) + 
+ selectOKCount, _ := selectOKAgg.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(3), selectOKCount.Int()) + + selectErrorCount, _ := selectErrorAgg.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(2), selectErrorCount.Int()) + + // Verify parent-child relationships + // SELECT_aggregated (OK) should be child of handler_aggregated (OK) + assert.Equal(t, handlerOKAgg.SpanID(), selectOKAgg.ParentSpanID()) + + // SELECT_aggregated (Error) should be child of handler_aggregated (Error) + assert.Equal(t, handlerErrorAgg.SpanID(), selectErrorAgg.ParentSpanID()) + + // Verify non-aggregated spans still exist + foundInsert := findSpanByExactName(td, "INSERT") + require.True(t, foundInsert, "INSERT span should still exist") + + foundWorker := findSpanByExactName(td, "worker") + require.True(t, foundWorker, "worker span should still exist") +} + +// TestLeafSpanPruning_ParentNotAggregatedIfChildrenMixed tests that parents are not +// aggregated if some children are aggregated but others are not +func TestLeafSpanPruning_ParentNotAggregatedIfChildrenMixed(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithMixedChildren(t) + + // Before: 1 root + 2 handlers + 3 SELECTs + 1 INSERT = 7 spans + originalSpanCount := countSpans(td) + assert.Equal(t, 7, originalSpanCount) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // After: 1 root + 2 handlers + 1 SELECT_aggregated + 1 INSERT = 5 spans + // Handlers should NOT be aggregated because one has a non-aggregated child (INSERT) + finalSpanCount := countSpans(td) + assert.Equal(t, 5, finalSpanCount) + + // Verify handler_aggregated does NOT exist + _, found := findSummarySpanByName(td, "handler") + assert.False(t, found, "handler 
summary should NOT exist") + + // Verify SELECT_aggregated exists + selectAgg, found := findSummarySpanByName(td, "SELECT") + require.True(t, found, "SELECT summary should exist") + + selectCount, _ := selectAgg.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(3), selectCount.Int()) + + // Verify original handler spans still exist + handlers := findAllSpansByExactName(td, "handler") + assert.Len(t, handlers, 2, "both handler spans should still exist") +} + +// TestLeafSpanPruning_RootSpansNotAggregated tests that root spans (with no parent) +// are never aggregated +func TestLeafSpanPruning_RootSpansNotAggregated(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithMultipleRoots(t) + + // Before: 3 root spans + 6 leaf spans (2 per root) = 9 spans + originalSpanCount := countSpans(td) + assert.Equal(t, 9, originalSpanCount) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // After: 3 root spans + 1 SELECT_aggregated = 4 spans + // Root spans should NOT be aggregated even though all children are aggregated + finalSpanCount := countSpans(td) + assert.Equal(t, 4, finalSpanCount) + + // Verify all root spans still exist + roots := findAllSpansByExactName(td, "root") + assert.Len(t, roots, 3, "all root spans should still exist") + + // Verify SELECT_aggregated exists + selectAgg, found := findSummarySpanByName(td, "SELECT") + require.True(t, found, "SELECT summary should exist") + + selectCount, _ := selectAgg.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(6), selectCount.Int()) +} + +// TestLeafSpanPruning_ThreeLevelAggregation tests aggregation across three levels +func TestLeafSpanPruning_ThreeLevelAggregation(t *testing.T) { + factory := NewFactory() + cfg := 
factory.CreateDefaultConfig().(*Config)
	cfg.MinSpansToAggregate = 2
	cfg.MaxParentDepth = -1 // unlimited for this test

	tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop())
	require.NoError(t, err)

	td := createTestTraceWithThreeLevels(t)

	// Before: 1 root + 2 middleware + 4 handlers (2 per middleware) + 8 SELECTs (2 per handler) = 15 spans
	originalSpanCount := countSpans(td)
	assert.Equal(t, 15, originalSpanCount)

	err = tp.ConsumeTraces(t.Context(), td)
	require.NoError(t, err)

	// After: 1 root + 1 middleware_aggregated + 1 handler_aggregated + 1 SELECT_aggregated = 4 spans
	finalSpanCount := countSpans(td)
	assert.Equal(t, 4, finalSpanCount)

	// Verify all aggregated spans exist
	middlewareAgg, found := findSummarySpanByName(td, "middleware")
	require.True(t, found, "middleware summary should exist")

	handlerAgg, found := findSummarySpanByName(td, "handler")
	require.True(t, found, "handler summary should exist")

	selectAgg, found := findSummarySpanByName(td, "SELECT")
	require.True(t, found, "SELECT summary should exist")

	// Verify parent-child relationships
	// handler summary should be child of middleware summary
	assert.Equal(t, middlewareAgg.SpanID(), handlerAgg.ParentSpanID())

	// SELECT summary should be child of handler summary
	assert.Equal(t, handlerAgg.SpanID(), selectAgg.ParentSpanID())

	// Verify span counts
	middlewareCount, _ := middlewareAgg.Attributes().Get("aggregation.span_count")
	assert.Equal(t, int64(2), middlewareCount.Int())

	handlerCount, _ := handlerAgg.Attributes().Get("aggregation.span_count")
	assert.Equal(t, int64(4), handlerCount.Int())

	selectCount, _ := selectAgg.Attributes().Get("aggregation.span_count")
	assert.Equal(t, int64(8), selectCount.Int())
}

// Helper functions for new tests

// createTestTraceWithRecursiveAggregation builds a 15-span trace under a
// single OK "root":
//   - 3 OK "handler" spans, each with one OK "SELECT" child (db.op=select)
//   - 2 Error "handler" spans, each with one Error "SELECT" child (db.op=select)
//   - 1 OK "handler" span with one OK "INSERT" child (db.op=insert)
//   - 1 OK "worker" span with one OK "SELECT" child (db.op=select)
//
// Only the SELECT/INSERT spans get start and end timestamps (100ns apart).
func createTestTraceWithRecursiveAggregation(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	spans := td.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty().Spans()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16})
	rootID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0})

	root := spans.AppendEmpty()
	root.SetTraceID(traceID)
	root.SetSpanID(rootID)
	root.SetName("root")
	root.Status().SetCode(ptrace.StatusCodeOk)

	// Repeated handler -> SELECT pairs: 3 with OK status, then 2 with Error.
	// The first byte of each span ID identifies the group, the second the index.
	pairGroups := []struct {
		count       int
		handlerByte byte
		selectByte  byte
		status      ptrace.StatusCode
	}{
		{3, 2, 3, ptrace.StatusCodeOk},
		{2, 4, 5, ptrace.StatusCodeError},
	}
	for _, g := range pairGroups {
		for i := range g.count {
			pairHandlerID := pcommon.SpanID([8]byte{g.handlerByte, byte(i), 0, 0, 0, 0, 0, 0})
			pairHandler := spans.AppendEmpty()
			pairHandler.SetTraceID(traceID)
			pairHandler.SetSpanID(pairHandlerID)
			pairHandler.SetParentSpanID(rootID)
			pairHandler.SetName("handler")
			pairHandler.Status().SetCode(g.status)

			pairSelect := spans.AppendEmpty()
			pairSelect.SetTraceID(traceID)
			pairSelect.SetSpanID(pcommon.SpanID([8]byte{g.selectByte, byte(i), 0, 0, 0, 0, 0, 0}))
			pairSelect.SetParentSpanID(pairHandlerID)
			pairSelect.SetName("SELECT")
			pairSelect.Attributes().PutStr("db.op", "select")
			pairSelect.Status().SetCode(g.status)
			pairSelect.SetStartTimestamp(pcommon.Timestamp(1000000000))
			pairSelect.SetEndTimestamp(pcommon.Timestamp(1000000100))
		}
	}

	// Single OK handler -> INSERT: the INSERT is alone in its group.
	insertHandlerID := pcommon.SpanID([8]byte{6, 0, 0, 0, 0, 0, 0, 0})
	insertHandler := spans.AppendEmpty()
	insertHandler.SetTraceID(traceID)
	insertHandler.SetSpanID(insertHandlerID)
	insertHandler.SetParentSpanID(rootID)
	insertHandler.SetName("handler")
	insertHandler.Status().SetCode(ptrace.StatusCodeOk)

	insert := spans.AppendEmpty()
	insert.SetTraceID(traceID)
	insert.SetSpanID(pcommon.SpanID([8]byte{7, 0, 0, 0, 0, 0, 0, 0}))
	insert.SetParentSpanID(insertHandlerID)
	insert.SetName("INSERT")
	insert.Attributes().PutStr("db.op", "insert")
	insert.Status().SetCode(ptrace.StatusCodeOk)
	insert.SetStartTimestamp(pcommon.Timestamp(1000000000))
	insert.SetEndTimestamp(pcommon.Timestamp(1000000100))

	// Single OK worker -> SELECT: same leaf as above but under a differently named parent.
	workerID := pcommon.SpanID([8]byte{8, 0, 0, 0, 0, 0, 0, 0})
	worker := spans.AppendEmpty()
	worker.SetTraceID(traceID)
	worker.SetSpanID(workerID)
	worker.SetParentSpanID(rootID)
	worker.SetName("worker")
	worker.Status().SetCode(ptrace.StatusCodeOk)

	workerSelect := spans.AppendEmpty()
	workerSelect.SetTraceID(traceID)
	workerSelect.SetSpanID(pcommon.SpanID([8]byte{9, 0, 0, 0, 0, 0, 0, 0}))
	workerSelect.SetParentSpanID(workerID)
	workerSelect.SetName("SELECT")
	workerSelect.Attributes().PutStr("db.op", "select")
	workerSelect.Status().SetCode(ptrace.StatusCodeOk)
	workerSelect.SetStartTimestamp(pcommon.Timestamp(1000000000))
	workerSelect.SetEndTimestamp(pcommon.Timestamp(1000000100))

	return td
}

func createTestTraceWithMixedChildren(t *testing.T) ptrace.Traces {
	t.Helper()
	td := ptrace.NewTraces()
	rs := td.ResourceSpans().AppendEmpty()
	ss := rs.ScopeSpans().AppendEmpty()

	traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16}) + rootSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + // Root span + root := ss.Spans().AppendEmpty() + root.SetTraceID(traceID) + root.SetSpanID(rootSpanID) + root.SetName("root") + + // Handler 1 with 3 SELECTs (will be aggregated) + handler1ID := pcommon.SpanID([8]byte{2, 0, 0, 0, 0, 0, 0, 0}) + handler1 := ss.Spans().AppendEmpty() + handler1.SetTraceID(traceID) + handler1.SetSpanID(handler1ID) + handler1.SetParentSpanID(rootSpanID) + handler1.SetName("handler") + handler1.Status().SetCode(ptrace.StatusCodeOk) + + for i := range 3 { + selectSpan := ss.Spans().AppendEmpty() + selectSpan.SetTraceID(traceID) + selectSpan.SetSpanID(pcommon.SpanID([8]byte{3, byte(i), 0, 0, 0, 0, 0, 0})) + selectSpan.SetParentSpanID(handler1ID) + selectSpan.SetName("SELECT") + selectSpan.Status().SetCode(ptrace.StatusCodeOk) + selectSpan.SetStartTimestamp(pcommon.Timestamp(1000000000)) + selectSpan.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + + // Handler 2 with 1 INSERT (not aggregated - mixed children) + handler2ID := pcommon.SpanID([8]byte{4, 0, 0, 0, 0, 0, 0, 0}) + handler2 := ss.Spans().AppendEmpty() + handler2.SetTraceID(traceID) + handler2.SetSpanID(handler2ID) + handler2.SetParentSpanID(rootSpanID) + handler2.SetName("handler") + handler2.Status().SetCode(ptrace.StatusCodeOk) + + insertSpan := ss.Spans().AppendEmpty() + insertSpan.SetTraceID(traceID) + insertSpan.SetSpanID(pcommon.SpanID([8]byte{5, 0, 0, 0, 0, 0, 0, 0})) + insertSpan.SetParentSpanID(handler2ID) + insertSpan.SetName("INSERT") + insertSpan.Status().SetCode(ptrace.StatusCodeOk) + insertSpan.SetStartTimestamp(pcommon.Timestamp(1000000000)) + insertSpan.SetEndTimestamp(pcommon.Timestamp(1000000100)) + + return td +} + +func createTestTraceWithMultipleRoots(t *testing.T) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15, 16}) + + // 3 root spans, each with 2 SELECT children + for i := range 3 { + rootID := pcommon.SpanID([8]byte{byte(i + 1), 0, 0, 0, 0, 0, 0, 0}) + root := ss.Spans().AppendEmpty() + root.SetTraceID(traceID) + root.SetSpanID(rootID) + root.SetName("root") + root.Status().SetCode(ptrace.StatusCodeOk) + + for j := range 2 { + selectSpan := ss.Spans().AppendEmpty() + selectSpan.SetTraceID(traceID) + selectSpan.SetSpanID(pcommon.SpanID([8]byte{byte(i + 4), byte(j), 0, 0, 0, 0, 0, 0})) + selectSpan.SetParentSpanID(rootID) + selectSpan.SetName("SELECT") + selectSpan.Status().SetCode(ptrace.StatusCodeOk) + selectSpan.SetStartTimestamp(pcommon.Timestamp(1000000000)) + selectSpan.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + } + + return td +} + +func createTestTraceWithThreeLevels(t *testing.T) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + rootSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + // Root span + root := ss.Spans().AppendEmpty() + root.SetTraceID(traceID) + root.SetSpanID(rootSpanID) + root.SetName("root") + + spanIDCounter := byte(2) + + // 2 middleware spans + for range 2 { + middlewareID := pcommon.SpanID([8]byte{spanIDCounter, 0, 0, 0, 0, 0, 0, 0}) + spanIDCounter++ + + middleware := ss.Spans().AppendEmpty() + middleware.SetTraceID(traceID) + middleware.SetSpanID(middlewareID) + middleware.SetParentSpanID(rootSpanID) + middleware.SetName("middleware") + middleware.Status().SetCode(ptrace.StatusCodeOk) + + // Each middleware has 2 handler spans + for range 2 { + handlerID := pcommon.SpanID([8]byte{spanIDCounter, 0, 0, 0, 0, 0, 0, 0}) + spanIDCounter++ + + handler := ss.Spans().AppendEmpty() + handler.SetTraceID(traceID) + handler.SetSpanID(handlerID) + handler.SetParentSpanID(middlewareID) + handler.SetName("handler") + 
handler.Status().SetCode(ptrace.StatusCodeOk) + + // Each handler has 2 SELECT spans + for range 2 { + selectSpan := ss.Spans().AppendEmpty() + selectSpan.SetTraceID(traceID) + selectSpan.SetSpanID(pcommon.SpanID([8]byte{spanIDCounter, 0, 0, 0, 0, 0, 0, 0})) + spanIDCounter++ + selectSpan.SetParentSpanID(handlerID) + selectSpan.SetName("SELECT") + selectSpan.Status().SetCode(ptrace.StatusCodeOk) + selectSpan.SetStartTimestamp(pcommon.Timestamp(1000000000)) + selectSpan.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + } + } + + return td +} + +func findSpanByName(td ptrace.Traces, nameSubstring, statusCode string) (ptrace.Span, bool) { + // findSpanByName finds a summary span by name substring and status code string + rss := td.ResourceSpans() + for i := 0; i < rss.Len(); i++ { + rs := rss.At(i) + ilss := rs.ScopeSpans() + for j := 0; j < ilss.Len(); j++ { + ils := ilss.At(j) + spans := ils.Spans() + for k := 0; k < spans.Len(); k++ { + span := spans.At(k) + isSummary, exists := span.Attributes().Get("aggregation.is_summary") + if strings.Contains(span.Name(), nameSubstring) && span.Status().Code().String() == statusCode && exists && isSummary.Bool() { + return span, true + } + } + } + } + return ptrace.Span{}, false +} + +func findSpanByExactName(td ptrace.Traces, name string) bool { + rss := td.ResourceSpans() + for i := 0; i < rss.Len(); i++ { + rs := rss.At(i) + ilss := rs.ScopeSpans() + for j := 0; j < ilss.Len(); j++ { + ils := ilss.At(j) + spans := ils.Spans() + for k := 0; k < spans.Len(); k++ { + span := spans.At(k) + if span.Name() == name { + return true + } + } + } + } + return false +} + +// findSummarySpanByName finds a summary span (with is_summary attribute) by exact name +func findSummarySpanByName(td ptrace.Traces, name string) (ptrace.Span, bool) { + rss := td.ResourceSpans() + for i := 0; i < rss.Len(); i++ { + rs := rss.At(i) + ilss := rs.ScopeSpans() + for j := 0; j < ilss.Len(); j++ { + ils := ilss.At(j) + spans := ils.Spans() + for k := 
0; k < spans.Len(); k++ { + span := spans.At(k) + isSummary, exists := span.Attributes().Get("aggregation.is_summary") + if span.Name() == name && exists && isSummary.Bool() { + return span, true + } + } + } + } + return ptrace.Span{}, false +} + +func findAllSpansByExactName(td ptrace.Traces, name string) []ptrace.Span { + var result []ptrace.Span + rss := td.ResourceSpans() + for i := 0; i < rss.Len(); i++ { + rs := rss.At(i) + ilss := rs.ScopeSpans() + for j := 0; j < ilss.Len(); j++ { + ils := ilss.At(j) + spans := ils.Spans() + for k := 0; k < spans.Len(); k++ { + span := spans.At(k) + if span.Name() == name { + result = append(result, span) + } + } + } + } + return result +} + +// TestLeafSpanPruning_LongestDurationTemplate tests that the span with the longest +// duration is used as the template for the summary span +func TestLeafSpanPruning_LongestDurationTemplate(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + cfg.GroupByAttributes = []string{"db.operation"} + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + // Create trace with spans of varying durations and unique identifying attributes + // The span with the longest duration should become the template + td := createTestTraceWithVaryingDurations(t) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // Find the summary span + summarySpan, found := findSummarySpan(td) + require.True(t, found, "summary span should exist") + + // Verify the template attribute from the longest-duration span is present + // The span with 500ns duration should be the template + identifier, exists := summarySpan.Attributes().Get("span.identifier") + require.True(t, exists, "span.identifier attribute should exist") + assert.Equal(t, "longest", identifier.Str(), "summary should use attributes from longest-duration span") + + // Verify 
duration stats + attrs := summarySpan.Attributes() + minDuration, _ := attrs.Get("aggregation.duration_min_ns") + assert.Equal(t, int64(100), minDuration.Int()) + + maxDuration, _ := attrs.Get("aggregation.duration_max_ns") + assert.Equal(t, int64(500), maxDuration.Int()) +} + +func createTestTraceWithVaryingDurations(t *testing.T) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + // Parent span + parentSpan := ss.Spans().AppendEmpty() + parentSpan.SetTraceID(traceID) + parentSpan.SetSpanID(parentSpanID) + parentSpan.SetName("parent") + + // Define spans with different durations and unique identifiers + // Duration order: 100ns, 500ns, 200ns - the 500ns span should be template + spanConfigs := []struct { + duration int64 + identifier string + }{ + {100, "short"}, + {500, "longest"}, // This one should be the template + {200, "medium"}, + } + + baseTime := int64(1000000000) + for i, cfg := range spanConfigs { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName("db_query") + span.SetStartTimestamp(pcommon.Timestamp(baseTime)) + span.SetEndTimestamp(pcommon.Timestamp(baseTime + cfg.duration)) + span.Attributes().PutStr("db.operation", "select") // Grouping key - same for all + span.Attributes().PutStr("span.identifier", cfg.identifier) // Unique per span + } + + return td +} + +// TraceState grouping tests for Consistent Probability Sampling (CPS) compatibility + +func TestLeafSpanPruning_TraceStateGrouping_SameTraceState(t *testing.T) { + // Test: spans with identical TraceState should be aggregated together + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + 
cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithSameTraceState(t, "ot=th:fd70a4;rv:12345") + originalSpanCount := countSpans(td) + assert.Equal(t, 4, originalSpanCount) // 1 parent + 3 leaf spans + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // All 3 leaf spans have same TraceState, should aggregate to 1 + finalSpanCount := countSpans(td) + assert.Equal(t, 2, finalSpanCount) // 1 parent + 1 summary + + summarySpan, found := findSummarySpan(td) + require.True(t, found) + + attrs := summarySpan.Attributes() + spanCount, _ := attrs.Get("aggregation.span_count") + assert.Equal(t, int64(3), spanCount.Int()) + + // Verify TraceState is preserved in summary span + assert.Equal(t, "ot=th:fd70a4;rv:12345", summarySpan.TraceState().AsRaw()) +} + +func TestLeafSpanPruning_TraceStateGrouping_DifferentThresholds(t *testing.T) { + // Test: spans with different th (threshold) values should be in separate groups + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithDifferentTraceStates(t) + originalSpanCount := countSpans(td) + assert.Equal(t, 6, originalSpanCount) // 1 parent + 3 spans (th:fd70a4) + 2 spans (th:fa00) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // 3 spans with th:fd70a4 -> 1 summary + // 2 spans with th:fa00 -> 1 summary + finalSpanCount := countSpans(td) + assert.Equal(t, 3, finalSpanCount) // 1 parent + 2 summaries + + // Verify we have two summary spans with different TraceState values + summaries := findAllSummarySpans(td) + assert.Len(t, summaries, 2, "should have 2 summary spans") + + // Collect TraceState values from 
summaries + traceStates := make(map[string]int64) + for _, summary := range summaries { + ts := summary.TraceState().AsRaw() + count, _ := summary.Attributes().Get("aggregation.span_count") + traceStates[ts] = count.Int() + } + + assert.Equal(t, int64(3), traceStates["ot=th:fd70a4;rv:12345"], "th:fd70a4 group should have 3 spans") + assert.Equal(t, int64(2), traceStates["ot=th:fa00;rv:12345"], "th:fa00 group should have 2 spans") +} + +func TestLeafSpanPruning_TraceStateGrouping_MixedWithEmpty(t *testing.T) { + // Test: spans with TraceState and spans without should be in separate groups + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithMixedTraceState(t) + originalSpanCount := countSpans(td) + assert.Equal(t, 6, originalSpanCount) // 1 parent + 3 spans (with TraceState) + 2 spans (empty) + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // 3 spans with TraceState -> 1 summary + // 2 spans with empty TraceState -> 1 summary + finalSpanCount := countSpans(td) + assert.Equal(t, 3, finalSpanCount) // 1 parent + 2 summaries + + summaries := findAllSummarySpans(td) + assert.Len(t, summaries, 2, "should have 2 summary spans") + + // Verify TraceState values are preserved correctly + var withTS, withoutTS ptrace.Span + for _, s := range summaries { + if s.TraceState().AsRaw() == "" { + withoutTS = s + } else { + withTS = s + } + } + + assert.Equal(t, "ot=th:fd70a4;rv:12345", withTS.TraceState().AsRaw()) + count1, _ := withTS.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(3), count1.Int()) + + assert.Empty(t, withoutTS.TraceState().AsRaw()) + count2, _ := withoutTS.Attributes().Get("aggregation.span_count") + assert.Equal(t, int64(2), count2.Int()) +} + +func 
TestLeafSpanPruning_TraceStateGrouping_EmptyTraceState(t *testing.T) { + // Test: spans with empty TraceState should be grouped together + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.MinSpansToAggregate = 2 + + tp, err := factory.CreateTraces(t.Context(), processortest.NewNopSettings(metadata.Type), cfg, consumertest.NewNop()) + require.NoError(t, err) + + td := createTestTraceWithSameTraceState(t, "") // Empty TraceState + originalSpanCount := countSpans(td) + assert.Equal(t, 4, originalSpanCount) // 1 parent + 3 leaf spans + + err = tp.ConsumeTraces(t.Context(), td) + require.NoError(t, err) + + // All 3 leaf spans have empty TraceState, should aggregate to 1 + finalSpanCount := countSpans(td) + assert.Equal(t, 2, finalSpanCount) + + summarySpan, found := findSummarySpan(td) + require.True(t, found) + assert.Empty(t, summarySpan.TraceState().AsRaw()) + + attrs := summarySpan.Attributes() + spanCount, _ := attrs.Get("aggregation.span_count") + assert.Equal(t, int64(3), spanCount.Int()) +} + +// Helper functions for TraceState tests + +func createTestTraceWithSameTraceState(t *testing.T, traceState string) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + // Parent span + parentSpan := ss.Spans().AppendEmpty() + parentSpan.SetTraceID(traceID) + parentSpan.SetSpanID(parentSpanID) + parentSpan.SetName("parent") + + // 3 leaf spans with identical TraceState + for i := range 3 { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName("SELECT") + span.TraceState().FromRaw(traceState) + span.SetStartTimestamp(pcommon.Timestamp(1000000000)) + 
span.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + + return td +} + +func createTestTraceWithDifferentTraceStates(t *testing.T) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + // Parent span + parentSpan := ss.Spans().AppendEmpty() + parentSpan.SetTraceID(traceID) + parentSpan.SetSpanID(parentSpanID) + parentSpan.SetName("parent") + + // 3 leaf spans with th:fd70a4 (1% sampling) + for i := range 3 { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName("SELECT") + span.TraceState().FromRaw("ot=th:fd70a4;rv:12345") + span.SetStartTimestamp(pcommon.Timestamp(1000000000)) + span.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + + // 2 leaf spans with th:fa00 (2% sampling) + for i := range 2 { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{3, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName("SELECT") + span.TraceState().FromRaw("ot=th:fa00;rv:12345") + span.SetStartTimestamp(pcommon.Timestamp(1000000000)) + span.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + + return td +} + +func createTestTraceWithMixedTraceState(t *testing.T) ptrace.Traces { + t.Helper() + td := ptrace.NewTraces() + rs := td.ResourceSpans().AppendEmpty() + ss := rs.ScopeSpans().AppendEmpty() + + traceID := pcommon.TraceID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + parentSpanID := pcommon.SpanID([8]byte{1, 0, 0, 0, 0, 0, 0, 0}) + + // Parent span + parentSpan := ss.Spans().AppendEmpty() + parentSpan.SetTraceID(traceID) + parentSpan.SetSpanID(parentSpanID) + parentSpan.SetName("parent") + + // 3 
leaf spans with TraceState + for i := range 3 { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{2, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName("SELECT") + span.TraceState().FromRaw("ot=th:fd70a4;rv:12345") + span.SetStartTimestamp(pcommon.Timestamp(1000000000)) + span.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + + // 2 leaf spans WITHOUT TraceState (empty) + for i := range 2 { + span := ss.Spans().AppendEmpty() + span.SetTraceID(traceID) + span.SetSpanID(pcommon.SpanID([8]byte{3, byte(i), 0, 0, 0, 0, 0, 0})) + span.SetParentSpanID(parentSpanID) + span.SetName("SELECT") + // No TraceState set (empty) + span.SetStartTimestamp(pcommon.Timestamp(1000000000)) + span.SetEndTimestamp(pcommon.Timestamp(1000000100)) + } + + return td +} + +func findAllSummarySpans(td ptrace.Traces) []ptrace.Span { + var result []ptrace.Span + rss := td.ResourceSpans() + for i := 0; i < rss.Len(); i++ { + ilss := rss.At(i).ScopeSpans() + for j := 0; j < ilss.Len(); j++ { + spans := ilss.At(j).Spans() + for k := 0; k < spans.Len(); k++ { + span := spans.At(k) + isSummary, exists := span.Attributes().Get("aggregation.is_summary") + if exists && isSummary.Bool() { + result = append(result, span) + } + } + } + } + return result +} diff --git a/processor/spanpruningprocessor/stats.go b/processor/spanpruningprocessor/stats.go new file mode 100644 index 0000000000000..851dc9ea6e208 --- /dev/null +++ b/processor/spanpruningprocessor/stats.go @@ -0,0 +1,67 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor" + +import ( + "time" + + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/ptrace" +) + +// aggregationData tracks statistics and time ranges for a group of spans in +// a single pass, replacing separate 
calculations for efficiency. +type aggregationData struct { + count int64 + minDuration time.Duration + maxDuration time.Duration + sumDuration time.Duration + earliestStart pcommon.Timestamp + latestEnd pcommon.Timestamp +} + +// calculateAggregationData derives span counts and duration stats for the +// provided nodes in one traversal. +func (*spanPruningProcessor) calculateAggregationData(nodes []*spanNode) aggregationData { + data := aggregationData{ + count: int64(len(nodes)), + } + + for i, node := range nodes { + span := node.span + data.updateWithSpan(span, i == 0) + } + + return data +} + +// updateWithSpan incorporates a single span into the aggregation statistics, +// tracking min/max durations and time ranges. +func (data *aggregationData) updateWithSpan(span ptrace.Span, isFirst bool) { + startTime := span.StartTimestamp().AsTime() + endTime := span.EndTimestamp().AsTime() + duration := endTime.Sub(startTime) + + // Calculate duration statistics + if isFirst { + data.minDuration = duration + data.maxDuration = duration + data.earliestStart = span.StartTimestamp() + data.latestEnd = span.EndTimestamp() + } else { + if duration < data.minDuration { + data.minDuration = duration + } + if duration > data.maxDuration { + data.maxDuration = duration + } + if span.StartTimestamp() < data.earliestStart { + data.earliestStart = span.StartTimestamp() + } + if span.EndTimestamp() > data.latestEnd { + data.latestEnd = span.EndTimestamp() + } + } + data.sumDuration += duration +} diff --git a/processor/spanpruningprocessor/testdata/config.yaml b/processor/spanpruningprocessor/testdata/config.yaml new file mode 100644 index 0000000000000..8109f4f24332b --- /dev/null +++ b/processor/spanpruningprocessor/testdata/config.yaml @@ -0,0 +1,12 @@ +spanpruning: + group_by_attributes: + - "db.operation" + min_spans_to_aggregate: 5 + aggregation_attribute_prefix: "aggregation." 
+ +spanpruning/custom: + group_by_attributes: + - "db.operation" + - "db.name" + min_spans_to_aggregate: 3 + aggregation_attribute_prefix: "batch." diff --git a/processor/spanpruningprocessor/tree.go b/processor/spanpruningprocessor/tree.go new file mode 100644 index 0000000000000..7ed00ddc9747d --- /dev/null +++ b/processor/spanpruningprocessor/tree.go @@ -0,0 +1,172 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor" + +import ( + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/ptrace" + "go.uber.org/zap" +) + +// spanNode models a span in the trace tree with cached relationships and +// aggregation bookkeeping. +type spanNode struct { + span ptrace.Span + scopeSpans ptrace.ScopeSpans + parent *spanNode + children []*spanNode + groupKey string // cached group key for leaf spans + isLeaf bool // true if node has no children + markedForRemoval bool // true if node will be aggregated +} + +// traceTree holds span nodes indexed by ID plus quick leaf/orphan lists for +// efficient aggregation analysis. +type traceTree struct { + nodeByID map[pcommon.SpanID]*spanNode + leaves []*spanNode // nodes with no children, populated during build + orphans []*spanNode // spans whose parent is not in the trace +} + +// buildTraceTree constructs parent/child links for a trace and records +// leaves, roots, and orphans so aggregation decisions can account for +// incomplete traces. 
+func (p *spanPruningProcessor) buildTraceTree(spans []spanInfo) *traceTree { + tree := &traceTree{ + nodeByID: make(map[pcommon.SpanID]*spanNode, len(spans)), + } + + if len(spans) == 0 { + return tree + } + + // First pass: create nodes for all spans, initially mark all as leaves + for _, info := range spans { + node := &spanNode{ + span: info.span, + scopeSpans: info.scopeSpans, + isLeaf: true, // assume leaf until a child links to it + } + tree.nodeByID[info.span.SpanID()] = node + } + + // Second pass: link parent-child relationships and update leaf status + // Pre-allocate slices with reasonable capacity + tree.orphans = make([]*spanNode, 0, len(spans)/10) + var rootCount int + + for _, node := range tree.nodeByID { + parentID := node.span.ParentSpanID() + if parentID.IsEmpty() { + // This is a root span (no parent) + rootCount++ + } else if parent, exists := tree.nodeByID[parentID]; exists { + // Link to parent and mark parent as non-leaf + node.parent = parent + parent.isLeaf = false + if parent.children == nil { + parent.children = make([]*spanNode, 0, 4) + } + parent.children = append(parent.children, node) + } else { + // Parent not in trace - this is an orphan + tree.orphans = append(tree.orphans, node) + } + } + + // Third pass: collect leaves (nodes still marked as leaf) + tree.leaves = make([]*spanNode, 0, len(spans)/4) + for _, node := range tree.nodeByID { + if node.isLeaf { + tree.leaves = append(tree.leaves, node) + } + } + + // Log warnings for incomplete traces + if rootCount > 1 { + p.logger.Debug("multiple root spans found", + zap.Int("rootCount", rootCount)) + } else if rootCount == 0 && len(tree.orphans) > 0 { + p.logger.Debug("no root span found, trace may be incomplete") + } + + if len(tree.orphans) > 0 { + p.logger.Debug("orphaned spans detected", + zap.Int("orphanCount", len(tree.orphans))) + } + + return tree +} + +// getLeaves returns the pre-computed leaf nodes (spans with no children). 
+func (t *traceTree) getLeaves() []*spanNode { + return t.leaves +} + +// findEligibleParentNodesFromCandidates filters candidate parents to those +// whose children are all marked for aggregation and that are themselves +// aggregate-able. +func (p *spanPruningProcessor) findEligibleParentNodesFromCandidates(candidates []*spanNode) []*spanNode { + if len(candidates) == 0 { + return nil + } + + eligibleParents := make([]*spanNode, 0, len(candidates)/4) + for _, node := range candidates { + if p.isEligibleForParentAggregation(node) { + eligibleParents = append(eligibleParents, node) + } + } + return eligibleParents +} + +// collectParentCandidates returns unique parents of marked nodes for the +// next aggregation depth iteration. +func collectParentCandidates(markedNodes []*spanNode) []*spanNode { + if len(markedNodes) == 0 { + return nil + } + + seen := make(map[*spanNode]struct{}, len(markedNodes)/2) + candidates := make([]*spanNode, 0, len(markedNodes)/2) + + for _, node := range markedNodes { + if node.parent != nil { + if _, exists := seen[node.parent]; !exists { + seen[node.parent] = struct{}{} + candidates = append(candidates, node.parent) + } + } + } + + return candidates +} + +// isEligibleForParentAggregation verifies that a node meets the criteria for +// parent aggregation (not root, all children marked, not already marked). 
+func (*spanPruningProcessor) isEligibleForParentAggregation(node *spanNode) bool { + // Must have children (not a leaf) + if node.isLeaf { + return false + } + + // Must have a parent (not root) + if node.parent == nil { + return false + } + + // Must not already be marked for removal + if node.markedForRemoval { + return false + } + + // All children must be marked for removal + for _, child := range node.children { + if !child.markedForRemoval { + return false + } + } + + return true +} diff --git a/reports/distributions/contrib.yaml b/reports/distributions/contrib.yaml index 5369615076392..38599139b0079 100644 --- a/reports/distributions/contrib.yaml +++ b/reports/distributions/contrib.yaml @@ -121,6 +121,7 @@ components: - resource - resourcedetection - span + - spanpruning - sumologic - tail_sampling - transform