open-telemetry · portertech · Jan 6, 2026 · Jan 6, 2026 · Jan 6, 2026 · Jan 6, 2026
@@ -215,6 +215,7 @@ components:
     - processor/scalewaydetector
     - processor/schema
     - processor/span
+    - processor/spanpruning
     - processor/sumologic
     - processor/tail_sampling
     - processor/transform

@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: new_component
+
+# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog)
+component: processor/spanpruning
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Add spanpruning processor for intelligent trace data reduction through span aggregation
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [3]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
@@ -0,0 +1 @@
+include ../../Makefile.Common
@@ -0,0 +1,237 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package spanpruningprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/spanpruningprocessor"
+
+import (
+	"encoding/binary"
+	"math/rand/v2"
+	"sort"
+	"time"
+
+	"go.opentelemetry.io/collector/pdata/pcommon"
+	"go.opentelemetry.io/collector/pdata/ptrace"
+)
+
+// aggregationGroup captures the spans to aggregate along with execution
+// metadata (tree depth, preassigned summary ID, and attribute loss info).
+type aggregationGroup struct {
+	nodes             []*spanNode            // nodes to aggregate (replaces []spanInfo for efficiency)
+	depth             int                    // tree depth (0 = leaf, 1 = parent of leaf, etc.)
+	summarySpanID     pcommon.SpanID         // SpanID of the summary span (assigned before creation)
+	lossInfo          attributeLossSummary   // attribute loss info (diverse + missing)
+	templateNode      *spanNode              // node to use as summary template (longest duration)
+	outlierAnalysis   *outlierAnalysisResult // IQR analysis results
+	preservedOutliers []*spanNode            // outliers to keep as individual spans
+}
+
+// aggregationPlan orders aggregation groups for top-down execution and
+// carries precomputed summary span IDs.
+type aggregationPlan struct {
+	groups []aggregationGroup
+}
+
+// findLongestDurationNode returns the node with the longest duration.
+func findLongestDurationNode(nodes []*spanNode) *spanNode {
+	if len(nodes) == 0 {
+		return nil
+	}
+	longest := nodes[0]
+	// pcommon.Timestamp is uint64 nanoseconds; direct subtraction avoids
+	// creating intermediate time.Time objects (2 per span otherwise).
+	longestDuration := int64(longest.span.EndTimestamp()) - int64(longest.span.StartTimestamp())
+	for _, node := range nodes[1:] {
+		duration := int64(node.span.EndTimestamp()) - int64(node.span.StartTimestamp())
+		if duration > longestDuration {
+			longest = node
+			longestDuration = duration
+		}
+	}
+	return longest
+}
+
+// generateSpanID produces a non-cryptographic span ID suitable for summary
+// spans; uniqueness is sufficient, not randomness strength.
+func generateSpanID() pcommon.SpanID {
+	var id [8]byte
+	binary.BigEndian.PutUint64(id[:], rand.Uint64())
+	return pcommon.SpanID(id)
+}
+
+// buildAggregationPlan sorts aggregation groups by depth (parents before
+// children) and preassigns summary SpanIDs to avoid conflicts during writes.
+func (*spanPruningProcessor) buildAggregationPlan(groups map[string]aggregationGroup) aggregationPlan {
+	// Convert map to slice with pre-allocation
+	groupSlice := make([]aggregationGroup, 0, len(groups))
+	for key := range groups {
+		groupSlice = append(groupSlice, groups[key])
+	}
+
+	// Sort by depth descending (highest depth first = top-down)
+	sort.Slice(groupSlice, func(i, j int) bool {
+		return groupSlice[i].depth > groupSlice[j].depth
+	})
+
+	// Pre-assign SpanIDs for all summary spans
+	for i := range groupSlice {
+		groupSlice[i].summarySpanID = generateSpanID()
+	}
+
+	return aggregationPlan{groups: groupSlice}
+}
+
+// executeAggregations performs the top-down creation of summary spans, batch
+// removes originals, and returns the number of pruned spans.
+func (p *spanPruningProcessor) executeAggregations(plan aggregationPlan) int {
+	// Track which parent SpanID should map to which summary SpanID
+	parentReplacements := make(map[pcommon.SpanID]pcommon.SpanID, len(plan.groups)*4)
+
+	// Track spans to remove per ScopeSpans for batch removal
+	spansToRemove := make(map[ptrace.ScopeSpans]map[pcommon.SpanID]struct{}, len(plan.groups))
+	prunedCount := 0
+
+	prefix := p.config.AggregationAttributePrefix
+
+	for i := range plan.groups {
+		group := &plan.groups[i]
+		// Calculate statistics and time range in single pass
+		data := p.calculateAggregationData(group.nodes)
+
+		// Determine the parent SpanID for the summary span
+		// Use the first node's parent as template
+		originalParentID := group.nodes[0].span.ParentSpanID()
+
+		// Check if the parent is being replaced by a summary span
+		summaryParentID := originalParentID
+		if replacementID, exists := parentReplacements[originalParentID]; exists {
+			summaryParentID = replacementID
+		}
+
+		// Create summary span with correct parent
+		p.createSummarySpanWithParent(*group, data, summaryParentID)
+
+		// Mark preserved outliers with reference to summary span
+		if len(group.preservedOutliers) > 0 {
+			for _, outlier := range group.preservedOutliers {
+				// Outliers become siblings of the summary span
+				outlier.span.SetParentSpanID(summaryParentID)
+				outlier.span.Attributes().PutBool(prefix+"is_preserved_outlier", true)
+				outlier.span.Attributes().PutStr(prefix+"summary_span_id",
+					group.summarySpanID.String())
+			}
+		}
+
+		// Record that these original span IDs should be replaced by the summary span ID
+		for _, node := range group.nodes {
+			spanID := node.span.SpanID()
+			parentReplacements[spanID] = group.summarySpanID
+			scopeSpans := node.scopeSpans
+			if spansToRemove[scopeSpans] == nil {
+				spansToRemove[scopeSpans] = make(map[pcommon.SpanID]struct{}, len(group.nodes))
+			}
+			spansToRemove[scopeSpans][spanID] = struct{}{}
+		}
+		prunedCount += len(group.nodes)
+	}
+
+	// Batch remove all marked spans in a single pass per ScopeSpans
+	for scopeSpans, spanIDs := range spansToRemove {
+		scopeSpans.Spans().RemoveIf(func(span ptrace.Span) bool {
+			_, shouldRemove := spanIDs[span.SpanID()]
+			return shouldRemove
+		})
+	}
+
+	return prunedCount
+}
+
+// createSummarySpanWithParent builds the summary span for an aggregation
+// group, wiring it under the provided parent SpanID and attaching stats
+// and attribute-loss annotations.
+func (p *spanPruningProcessor) createSummarySpanWithParent(group aggregationGroup, data aggregationData, parentSpanID pcommon.SpanID) ptrace.Span {
+	// Use the template node (longest duration span) as a template
+	templateNode := group.templateNode
+	templateSpan := templateNode.span
+	scopeSpans := templateNode.scopeSpans
+
+	// Create new span in the same ScopeSpans as the first span
+	newSpan := scopeSpans.Spans().AppendEmpty()
+
+	// Copy basic properties from template
+	newSpan.SetName(templateSpan.Name())
+	newSpan.SetTraceID(templateSpan.TraceID())
+	newSpan.SetSpanID(group.summarySpanID)
+	newSpan.SetParentSpanID(parentSpanID)
+	newSpan.SetKind(templateSpan.Kind())
+
+	// Set timestamps from aggregation data
+	newSpan.SetStartTimestamp(data.earliestStart)
+	newSpan.SetEndTimestamp(data.latestEnd)
+
+	// Copy attributes from template
+	templateSpan.Attributes().CopyTo(newSpan.Attributes())
+
+	// Copy status from template
+	templateSpan.Status().CopyTo(newSpan.Status())
+
+	// Copy TraceState from template for Consistent Probability Sampling compatibility
+	newSpan.TraceState().FromRaw(templateSpan.TraceState().AsRaw())
+
+	// Add aggregation statistics as attributes
+	prefix := p.config.AggregationAttributePrefix
+	newSpan.Attributes().PutBool(prefix+"is_summary", true)
+	newSpan.Attributes().PutInt(prefix+"span_count", data.count)
+	newSpan.Attributes().PutInt(prefix+"duration_min_ns", int64(data.minDuration))
+	newSpan.Attributes().PutInt(prefix+"duration_max_ns", int64(data.maxDuration))
+	newSpan.Attributes().PutInt(prefix+"duration_total_ns", int64(data.sumDuration))
+	if data.count > 0 {
+		newSpan.Attributes().PutInt(prefix+"duration_avg_ns", int64(data.sumDuration)/data.count)
+	}
+
+	// Add outlier analysis attributes when enabled
+	if group.outlierAnalysis != nil {
+		newSpan.Attributes().PutInt(prefix+"duration_median_ns", int64(group.outlierAnalysis.median))
+
+		if len(group.outlierAnalysis.correlations) > 0 {
+			newSpan.Attributes().PutStr(prefix+"outlier_correlated_attributes",
+				formatCorrelations(group.outlierAnalysis.correlations))
+		}
+
+		// Track preserved outliers
+		if len(group.preservedOutliers) > 0 {
+			newSpan.Attributes().PutInt(prefix+"preserved_outlier_count",
+				int64(len(group.preservedOutliers)))
+
+			// List preserved outlier span IDs
+			outlierIDs := newSpan.Attributes().PutEmptySlice(prefix + "preserved_outlier_span_ids")
+			for _, outlier := range group.preservedOutliers {
+				outlierIDs.AppendEmpty().SetStr(outlier.span.SpanID().String())
+			}
+		}
+	}
+
+	// Add histogram attributes if enabled
+	if len(p.config.AggregationHistogramBuckets) > 0 {
+		// Add bucket bounds (in seconds)
+		bucketBoundsSlice := newSpan.Attributes().PutEmptySlice(prefix + "histogram_bucket_bounds_s")
+		for _, bucket := range p.config.AggregationHistogramBuckets {
+			bucketBoundsSlice.AppendEmpty().SetDouble(float64(bucket) / float64(time.Second))
+		}
+
+		// Add bucket counts
+		bucketCountsSlice := newSpan.Attributes().PutEmptySlice(prefix + "histogram_bucket_counts")
+		for _, count := range data.bucketCounts {
+			bucketCountsSlice.AppendEmpty().SetInt(count)
+		}
+	}
+
+	// Add attribute loss info when detected
+	if len(group.lossInfo.diverse) > 0 {
+		newSpan.Attributes().PutStr(prefix+"diverse_attributes", formatAttributeCardinality(group.lossInfo.diverse))
+	}
+	if len(group.lossInfo.missing) > 0 {
+		newSpan.Attributes().PutStr(prefix+"missing_attributes", formatAttributeCardinality(group.lossInfo.missing))
+	}
+
+	return newSpan
+}