open-telemetry · jpkrohling · Jun 13, 2024 · May 12, 2023 · May 15, 2023 · May 16, 2023
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: probabilisticsamplerprocessor
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Add Proportional and Equalizing sampling modes
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [31918]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: Both the existing hash_seed mode and the two new modes use OTEP 235 semantic conventions to encode sampling probability.
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
@@ -1,3 +1,4 @@
+
 # Probabilistic Sampling Processor
 
 <!-- status autogenerated section -->
@@ -115,7 +116,9 @@ interpreted as a percentage, with values >= 100 equal to 100%
 sampling.  The logs sampling priority attribute is configured via
 `sampling_priority`.
 
-## Sampling algorithm
+## Mode Selection
+
+There are three sampling modes available.  All modes are consistent.
 
 ### Hash seed
 
@@ -135,7 +138,154 @@ In order for hashing to be consistent, all collectors for a given tier
 at different collector tiers to support additional sampling
 requirements.
 
-This mode uses 14 bits of sampling precision.
+This mode uses 14 bits of information in its sampling decision; the
+default `sampling_precision`, which is 4 hexadecimal digits, exactly
+encodes this information.
+
+This mode is selected by default when `hash_seed` is configured or
+`attribute_source` is set to `record`.
+
+#### Hash seed: Use-cases
+
+The hash seed mode is most useful in logs sampling, because it can be
+applied to units of telemetry other than TraceID.  For example, a
+deployment consisting of 100 pods can be sampled according to the
+`service.instance.id` resource attribute.  In this case, 10% sampling
+implies collecting log records from an expected value of 10 pods.
+
+### Proportional
+
+OpenTelemetry specifies a consistent sampling mechanism using 56 bits
+of randomness, which may be obtained from the Trace ID according to
+the W3C Trace Context Level 2 specification.  Randomness can also be
+explicitly encoded in the OpenTelemetry `tracestate` field, where it is
+known as the R-value.
+
+This mode is so named because it reduces the number of items transmitted
+proportionally, according to the sampling probability.  In this mode,
+items are selected for sampling without considering how much they were
+already sampled by preceding samplers.
+
+This mode uses 56 bits of information in its calculations.  The
+default `sampling_precision` (4) will cause thresholds to be rounded
+in some cases when they contain more than 16 significant bits.
+
+#### Proportional: Use-cases
+
+The proportional mode is generally applicable in trace sampling,
+because it is based on OpenTelemetry and W3C specifications.  This
+mode is selected by default, because it enforces a predictable
+(probabilistic) ratio between incoming items and outgoing items of
+telemetry.  No matter how SDKs and other sources of telemetry have
+been configured with respect to sampling, a collector configured with
+25% proportional sampling will output (an expected value of) 1 item
+for every 4 items input.
+
+### Equalizing
+
+This mode uses the same randomness mechanism as the proportional
+sampling mode, in this case considering how much each item was already
+sampled by preceding samplers.  This mode can be used to lower
+sampling probability to a minimum value across a whole pipeline, 
+making it possible to conditionally adjust sampling probabilities.
+
+This mode compares a 56 bit threshold against the configured sampling
+probability and updates when the threshold is larger.  The default
+`sampling_precision` (4) will cause updated thresholds to be rounded
+in some cases when they contain more than 16 significant bits.
+
+#### Equalizing: Use-cases
+
+The equalizing mode is useful in collector deployments where client
+SDKs have mixed sampling configuration and the user wants to apply a
+uniform sampling probability across the system.  For example, a user's
+system consists of mostly components developed in-house, but also some
+third-party software.  Seeking to lower the overall cost of tracing,
+the user configures 10% sampling in the samplers for all of their in-house
+components.  This leaves third-party software components unsampled,
+making the savings less than desired.  In this case, the user could
+configure a 10% equalizing probabilistic sampler.  Already-sampled
+items of telemetry from the in-house components will pass through
+one-for-one in this scenario, while items of telemetry from third-party
+software will be sampled by the intended amount.
+
+## Sampling threshold information
+
+In all modes, information about the effective sampling probability is
+added into the item of telemetry.  The random variable that was used
+may also be recorded, in case it was not derived from the TraceID
+using a standard algorithm.
+
+For traces, threshold and optional randomness information are encoded
+in the W3C Trace Context `tracestate` fields.  The tracestate is
+divided into sections according to a two-character vendor code;
+OpenTelemetry uses "ot" as its section designator.  Within the
+OpenTelemetry section, the sampling threshold is encoded using "th"
+and the optional random variable is encoded using "rv".
+
+For example, 25% sampling is encoded in a tracing Span as:
+
+```
+tracestate: ot=th:c
+```
+
+Users can set randomness values in this way, independently, making it
+possible to apply consistent sampling across traces, for example.  If
+the Trace was initialized with a pre-determined randomness value
+`9b8233f7e3a151` and 100% sampling, it would read:
+
+```
+tracestate: ot=th:0;rv:9b8233f7e3a151
+```
+
+This component, using either proportional or equalizing modes, could
+apply 50% sampling to the Span.  This span with randomness value
+`9b8233f7e3a151` is consistently sampled at 50% because the threshold,
+when zero padded (i.e., `80000000000000`), is less than the randomness
+value.  The resulting span will have the following tracestate:
+
+```
+tracestate: ot=th:8;rv:9b8233f7e3a151
+```
+
+For log records, threshold and randomness information are encoded in
+the log record itself, using attributes.  For example, 25% sampling
+with an explicit randomness value is encoded as:
+
+```
+sampling.threshold: c
+sampling.randomness: e05a99c8df8d32
+```
+
+### Sampling precision
+
+When encoding sampling probability in the form of a threshold,
+variable precision is permitted making it possible for the user to
+restrict sampling probabilities to rounded numbers of fixed width.
+
+Because the threshold is encoded using hexadecimal digits, each digit
+contributes 4 bits of information.  One digit of sampling precision
+can express exact sampling probabilities 1/16, 2/16, ... through
+16/16.  Two digits of sampling precision can express exact sampling
+probabilities 1/256, 2/256, ... through 256/256.  With N digits of
+sampling precision, there are `16^N` (that is, `2^(4N)`) exactly
+representable probabilities.
+
+Depending on the mode, there are different maximum reasonable settings
+for this parameter.
+
+- The `hash_seed` mode uses a 14-bit hash function, therefore
+  precision 4 completely captures the available information.
+- The `equalizing` mode configures a sampling probability after
+  parsing a `float32` value, which contains 20 bits of precision,
+  therefore precision 5 completely captures the available information.
+- The `proportional` mode configures its ratio using a `float32`
+  value, however it carries out the arithmetic using 56-bits of
+  precision.  In this mode, increasing precision has the effect
+  of preserving precision applied by preceding samplers.
+
+In cases where larger precision is configured than is actually
+available, the added precision has no effect because trailing zeros
+are eliminated by the encoding.
 
 ### Error handling
 
@@ -153,9 +303,11 @@ false, in which case erroneous data will pass through the processor.
 
 The following configuration options can be modified:
 
+- `mode` (string, optional): One of "proportional", "equalizing", or "hash_seed"; the default is "proportional" unless either `hash_seed` is configured or `attribute_source` is set to `record`.
 - `sampling_percentage` (32-bit floating point, required): Percentage at which items are sampled; >= 100 samples all items, 0 rejects all items.
 - `hash_seed` (32-bit unsigned integer, optional, default = 0): An integer used to compute the hash algorithm. Note that all collectors for a given tier (e.g. behind the same load balancer) should have the same hash_seed.
 - `fail_closed` (boolean, optional, default = true): Whether to reject items with sampling-related errors.
+- `sampling_precision` (integer, optional, default = 4): Determines the number of hexadecimal digits used to encode the sampling threshold.  Permitted values are 1..14.
 
 ### Logs-specific configuration
 

@@ -5,8 +5,11 @@ package probabilisticsamplerprocessor // import "github.com/open-telemetry/opent
 
 import (
 	"fmt"
+	"math"
 
 	"go.opentelemetry.io/collector/component"
+
+	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling"
 )
 
 type AttributeSource string
@@ -35,6 +38,33 @@ type Config struct {
 	// different sampling rates, configuring different seeds avoids that.
 	HashSeed uint32 `mapstructure:"hash_seed"`
 
+	// Mode selects the sampling behavior. Supported values:
+	//
+	// - "hash_seed": the legacy behavior of this processor.
+	//   Using an FNV hash combined with the HashSeed value, this
+	//   sampler performs a non-consistent probabilistic
+	//   downsampling.  The number of spans output is expected to
+	//   equal SamplingPercentage (as a ratio) times the number of
+	//   spans input, assuming good behavior from FNV and good
+	//   entropy in the hashed attributes or TraceID.
+	//
+	// - "equalizing": Using an OTel-specified consistent sampling
+	//   mechanism, this sampler selectively reduces the effective
+	//   sampling probability of arriving spans.  This can be
+	//   useful to select a small fraction of complete traces from
+	//   a stream with mixed sampling rates.  The rate of spans
+	//   passing through depends on how much sampling has already
+	//   been applied.  If an arriving span was head sampled at
+	//   the same probability it passes through.  If the span
+	//   arrives with lower probability, a warning is logged
+	//   because it means this sampler is configured with too
+	//   large a sampling probability to ensure complete traces.
+	//
+	// - "proportional": Using an OTel-specified consistent sampling
+	//   mechanism, this sampler reduces the effective sampling
+	//   probability of each span by `SamplingPercentage`.
+	Mode SamplerMode `mapstructure:"mode"`
+
 	// FailClosed indicates to not sample data (the processor will
 	// fail "closed") in case of error, such as failure to parse
 	// the tracestate field or missing the randomness attribute.
@@ -45,6 +75,14 @@ type Config struct {
 	// despite errors using priority.
 	FailClosed bool `mapstructure:"fail_closed"`
 
+	// SamplingPrecision is how many hex digits of sampling
+	// threshold will be encoded, from 1 up to 14.  Default is 4.
+	// A value of 0 is rejected by Validate.
+	SamplingPrecision int `mapstructure:"sampling_precision"`
+
+	///////
+	// Logs only fields below.
+
 	// AttributeSource (logs only) defines where to look for the attribute in from_attribute. The allowed values are
 	// `traceID` or `record`. Default is `traceID`.
 	AttributeSource `mapstructure:"attribute_source"`
@@ -61,11 +99,34 @@ var _ component.Config = (*Config)(nil)
 
 // Validate checks if the processor configuration is valid
 func (cfg *Config) Validate() error {
-	if cfg.SamplingPercentage < 0 {
-		return fmt.Errorf("negative sampling rate: %.2f", cfg.SamplingPercentage)
+	pct := float64(cfg.SamplingPercentage)
+
+	if math.IsInf(pct, 0) || math.IsNaN(pct) {
+		return fmt.Errorf("sampling rate is invalid: %f%%", cfg.SamplingPercentage)
+	}
+	ratio := pct / 100.0
+
+	switch {
+	case ratio < 0:
+		return fmt.Errorf("sampling rate is negative: %f%%", cfg.SamplingPercentage)
+	case ratio == 0:
+		// Special case
+	case ratio < sampling.MinSamplingProbability:
+		// Too-small case
+		return fmt.Errorf("sampling rate is too small: %g%%", cfg.SamplingPercentage)
+	default:
+		// Note that ratio > 1 is specifically allowed by the README, taken to mean 100%
 	}
+
 	if cfg.AttributeSource != "" && !validAttributeSource[cfg.AttributeSource] {
 		return fmt.Errorf("invalid attribute source: %v. Expected: %v or %v", cfg.AttributeSource, traceIDAttributeSource, recordAttributeSource)
 	}
+
+	if cfg.SamplingPrecision == 0 {
+		return fmt.Errorf("invalid sampling precision: 0")
+	} else if cfg.SamplingPrecision > sampling.NumHexDigits {
+		return fmt.Errorf("sampling precision is too great, should be <= 14: %d", cfg.SamplingPrecision)
+	}
+
 	return nil
 }
@@ -26,6 +26,8 @@ func TestLoadConfig(t *testing.T) {
 			id: component.NewIDWithName(metadata.Type, ""),
 			expected: &Config{
 				SamplingPercentage: 15.3,
+				SamplingPrecision:  4,
+				Mode:               "proportional",
 				AttributeSource:    "traceID",
 				FailClosed:         true,
 			},
@@ -34,7 +36,9 @@ func TestLoadConfig(t *testing.T) {
 			id: component.NewIDWithName(metadata.Type, "logs"),
 			expected: &Config{
 				SamplingPercentage: 15.3,
+				SamplingPrecision:  defaultPrecision,
 				HashSeed:           22,
+				Mode:               "",
 				AttributeSource:    "record",
 				FromAttribute:      "foo",
 				SamplingPriority:   "bar",
@@ -68,7 +72,11 @@ func TestLoadInvalidConfig(t *testing.T) {
 		file     string
 		contains string
 	}{
-		{"invalid_negative.yaml", "negative sampling rate"},
+		{"invalid_negative.yaml", "sampling rate is negative"},
+		{"invalid_small.yaml", "sampling rate is too small"},
+		{"invalid_inf.yaml", "sampling rate is invalid: +Inf%"},
+		{"invalid_prec.yaml", "sampling precision is too great"},
+		{"invalid_zero.yaml", "invalid sampling precision"},
 	} {
 		t.Run(test.file, func(t *testing.T) {
 			factories, err := otelcoltest.NopFactories()

@@ -40,8 +40,10 @@ func NewFactory() processor.Factory {
 
 // createDefaultConfig returns the configuration used when the user sets
 // nothing: AttributeSource is defaultAttributeSource, FailClosed is true,
 // Mode is modeUnset, and SamplingPrecision is defaultPrecision.
 func createDefaultConfig() component.Config {
 	return &Config{
-		AttributeSource: defaultAttributeSource,
-		FailClosed:      true,
+		AttributeSource:   defaultAttributeSource,
+		FailClosed:        true,
+		Mode:              modeUnset,
+		SamplingPrecision: defaultPrecision,
 	}
 }