Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .chloggen/ottl-truncate-all-utf8.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: breaking

# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog)
component: pkg/ottl

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: "`truncate_all` function now truncates at UTF-8 character boundaries by default"

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [36713]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
The default `truncate_all` behavior has changed. Truncation now respects UTF-8 character boundaries by default (new optional parameter `utf8_safe`, default: `true`), so results stay valid UTF-8 and may be slightly shorter than the limit.
To keep the previous byte-level truncation behavior (e.g. for non-UTF-8 data or to avoid any behavior change), set `utf8_safe` to `false` in all `truncate_all` usages.

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
14 changes: 8 additions & 6 deletions pkg/ottl/ottlfuncs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -448,20 +448,22 @@ Examples:

### truncate_all

`truncate_all(target, limit)`
`truncate_all(target, limit, Optional[utf8_safe])`

The `truncate_all` function truncates all string values in a `pcommon.Map` so that none are longer than the limit.

`target` is a path expression to a `pcommon.Map` type field. `limit` is a non-negative integer.
`target` is a path expression to a `pcommon.Map` type field. `limit` is a non-negative integer representing the maximum number of bytes. `utf8_safe` is an optional boolean (default: `true`) that enables UTF-8 aware truncation.

The map will be mutated such that the number of characters in all string values is less than or equal to the limit. Non-string values are ignored.
The map will be mutated such that the number of bytes in all string values is less than or equal to the limit. Non-string values are ignored.

Examples:
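By default (`utf8_safe` is `true`), string values are assumed to be valid UTF-8 and are only cut at UTF-8 character boundaries, so a multi-byte character is never split and valid input always produces valid UTF-8 output. If cutting at exactly `limit` bytes would split a multi-byte character, the string is cut at the preceding character boundary instead, so the result may be up to 3 bytes shorter than `limit`. Boundaries are Unicode code points, not grapheme clusters, so a sequence of several code points that renders as one symbol (for example an emoji with a skin-tone modifier) may still be split between code points.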

- `truncate_all(log.attributes, 100)`
When `utf8_safe` is set to `false`, truncation is applied at the byte limit only. Multi-byte UTF-8 characters may be split and the result can be invalid UTF-8. This mode is faster but should only be used when preserving valid UTF-8 is not required.

Examples:

- `truncate_all(resource.attributes, 50)`
- `truncate_all(log.attributes, 100)`
- `truncate_all(resource.attributes, 50, false)`

## Converters

Expand Down
26 changes: 17 additions & 9 deletions pkg/ottl/ottlfuncs/func_truncate_all.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ import (
"context"
"errors"
"fmt"
"unicode/utf8"

"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl"
)

type TruncateAllArguments[K any] struct {
Target ottl.PMapGetSetter[K]
Limit int64
Target ottl.PMapGetSetter[K]
Limit int64
Utf8Safe ottl.Optional[bool]
}

func NewTruncateAllFactory[K any]() ottl.Factory[K] {
Expand All @@ -27,26 +29,32 @@ func createTruncateAllFunction[K any](_ ottl.FunctionContext, oArgs ottl.Argumen
return nil, errors.New("TruncateAllFactory args must be of type *TruncateAllArguments[K]")
}

return TruncateAll(args.Target, args.Limit)
return TruncateAll(args.Target, args.Limit, args.Utf8Safe)
}

func TruncateAll[K any](target ottl.PMapGetSetter[K], limit int64) (ottl.ExprFunc[K], error) {
func TruncateAll[K any](target ottl.PMapGetSetter[K], limit int64, utf8Safe ottl.Optional[bool]) (ottl.ExprFunc[K], error) {
if limit < 0 {
return nil, fmt.Errorf("invalid limit for truncate_all function, %d cannot be negative", limit)
}
return func(ctx context.Context, tCtx K) (any, error) {
if limit < 0 {
return nil, nil
}

useUTF8Safe := utf8Safe.GetOr(true)

return func(ctx context.Context, tCtx K) (any, error) {
val, err := target.Get(ctx, tCtx)
if err != nil {
return nil, err
}
for _, value := range val.All() {
stringVal := value.Str()
if int64(len(stringVal)) > limit {
value.SetStr(stringVal[:limit])
truncateAt := int(limit)
Comment thread
edmocosta marked this conversation as resolved.
if useUTF8Safe {
// Back up to a valid UTF-8 boundary if we're in the middle of a rune
for truncateAt > 0 && !utf8.RuneStart(stringVal[truncateAt]) {
truncateAt--
}
}
value.SetStr(stringVal[:truncateAt])
}
}
// TODO: Write log when truncation is performed
Expand Down
78 changes: 74 additions & 4 deletions pkg/ottl/ottlfuncs/func_truncate_all_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ func Test_truncateAll(t *testing.T) {
},
}

exprFunc, err := TruncateAll(target, tt.limit)
exprFunc, err := TruncateAll(target, tt.limit, ottl.Optional[bool]{})
require.NoError(t, err)

_, err = exprFunc(nil, scenarioMap)
Expand All @@ -98,8 +98,78 @@ func Test_truncateAll(t *testing.T) {
}
}

// Test_truncateAll_UTF8 verifies that TruncateAll, with utf8_safe explicitly
// enabled, never cuts a string value in the middle of a multi-byte UTF-8
// sequence: when the byte limit falls inside a rune, the cut moves back to the
// start of that rune.
func Test_truncateAll_UTF8(t *testing.T) {
	tests := []struct {
		name   string
		input  string
		limit  int64
		expect string
	}{
		{
			// 'ab' (2) + emoji (4) + 'c' (1) = 7 bytes.
			// A limit of 4 falls inside the emoji, so the cut backs up to "ab".
			name:   "mid-rune truncation backs up to boundary",
			input:  "ab😀c",
			limit:  4,
			expect: "ab",
		},
		{
			// A limit of 6 falls exactly after the emoji, so nothing extra is dropped.
			name:   "exact rune boundary preserved",
			input:  "ab😀c",
			limit:  6,
			expect: "ab😀",
		},
		{
			// Grapheme cluster: "woman, medium-dark skin tone, white hair" is
			// 1 visible character but consists of 4 Unicode code points (runes):
			//   U+1F469 (woman)             = 4 bytes (f0 9f 91 a9)
			//   U+1F3FE (skin tone)         = 4 bytes (f0 9f 8f be)
			//   U+200D  (zero-width joiner) = 3 bytes (e2 80 8d)
			//   U+1F9B3 (white hair)        = 4 bytes (f0 9f a6 b3)
			// Total: 15 bytes, 4 runes, 1 visible character.
			// Truncating at limit=10 would cut inside the zero-width joiner
			// (bytes 9-11, counting from 1), so the cut backs up to byte 8,
			// the end of the skin tone modifier.
			// The result is valid UTF-8 but a split grapheme cluster.
			//
			// The literals use escapes so the invisible zero-width joiner
			// cannot be silently lost when the file is edited or copied.
			name:   "grapheme cluster truncates at rune boundary not grapheme boundary",
			input:  "\U0001F469\U0001F3FE\u200D\U0001F9B3",
			limit:  10,
			expect: "\U0001F469\U0001F3FE",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			scenarioMap := pcommon.NewMap()
			scenarioMap.PutStr("test", tt.input)

			// The target reads and writes the scenario map itself; the flag
			// records whether the function under test invoked the setter.
			setterWasCalled := false
			target := &ottl.StandardPMapGetSetter[pcommon.Map]{
				Getter: func(_ context.Context, tCtx pcommon.Map) (pcommon.Map, error) {
					return tCtx, nil
				},
				Setter: func(_ context.Context, tCtx pcommon.Map, m any) error {
					setterWasCalled = true
					if v, ok := m.(pcommon.Map); ok {
						v.CopyTo(tCtx)
						return nil
					}
					return errors.New("expected pcommon.Map")
				},
			}

			utf8SafeOpt := ottl.NewTestingOptional(true)
			exprFunc, err := TruncateAll(target, tt.limit, utf8SafeOpt)
			require.NoError(t, err)

			_, err = exprFunc(nil, scenarioMap)
			require.NoError(t, err)
			assert.True(t, setterWasCalled)

			// Fail with a clear message if the key vanished, instead of
			// reading from the zero Value returned for a missing key.
			result, ok := scenarioMap.Get("test")
			require.True(t, ok, "key \"test\" must still be present after truncation")
			assert.Equal(t, tt.expect, result.Str())
		})
	}
}

func Test_truncateAll_validation(t *testing.T) {
_, err := TruncateAll[any](&ottl.StandardPMapGetSetter[any]{}, -1)
_, err := TruncateAll[any](&ottl.StandardPMapGetSetter[any]{}, -1, ottl.Optional[bool]{})
require.Error(t, err)
assert.ErrorContains(t, err, "invalid limit for truncate_all function, -1 cannot be negative")
}
Expand All @@ -115,7 +185,7 @@ func Test_truncateAll_bad_input(t *testing.T) {
},
}

exprFunc, err := TruncateAll[any](target, 1)
exprFunc, err := TruncateAll[any](target, 1, ottl.Optional[bool]{})
require.NoError(t, err)

_, err = exprFunc(nil, input)
Expand All @@ -132,7 +202,7 @@ func Test_truncateAll_get_nil(t *testing.T) {
},
}

exprFunc, err := TruncateAll[any](target, 1)
exprFunc, err := TruncateAll[any](target, 1, ottl.Optional[bool]{})
require.NoError(t, err)

_, err = exprFunc(nil, nil)
Expand Down
Loading