diff --git a/arrow/endian/big.go b/arrow/endian/big.go index 0b925857..9dfc76c0 100644 --- a/arrow/endian/big.go +++ b/arrow/endian/big.go @@ -19,12 +19,22 @@ package endian -import "encoding/binary" - -var Native = binary.BigEndian +import "math/bits" const ( IsBigEndian = true NativeEndian = BigEndian NonNativeEndian = LittleEndian ) + +func FromLE[T uint16 | uint32 | uint64](x T) T { + switch v := any(x).(type) { + case uint16: + return T(bits.ReverseBytes16(v)) + case uint32: + return T(bits.ReverseBytes32(v)) + case uint64: + return T(bits.ReverseBytes64(v)) + } + return x +} diff --git a/arrow/endian/endian.go b/arrow/endian/endian.go index f369945d..ad2b1085 100644 --- a/arrow/endian/endian.go +++ b/arrow/endian/endian.go @@ -17,10 +17,14 @@ package endian import ( + "encoding/binary" + "github.com/apache/arrow-go/v18/arrow/internal/debug" "github.com/apache/arrow-go/v18/arrow/internal/flatbuf" ) +var Native = binary.NativeEndian + type Endianness flatbuf.Endianness const ( diff --git a/arrow/endian/little.go b/arrow/endian/little.go index def1fc64..12fbed30 100644 --- a/arrow/endian/little.go +++ b/arrow/endian/little.go @@ -19,12 +19,12 @@ package endian -import "encoding/binary" - -var Native = binary.LittleEndian - const ( IsBigEndian = false NativeEndian = LittleEndian NonNativeEndian = BigEndian ) + +func FromLE[T uint16 | uint32 | uint64](x T) T { + return x +} diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index e254fd7a..adad69e0 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -34,3 +34,5 @@ parquet/internal/gen-go/parquet/GoUnusedProtection__.go parquet/internal/gen-go/parquet/parquet-consts.go parquet/internal/gen-go/parquet/parquet.go parquet/version_string.go +parquet/variant/basic_type_stringer.go +parquet/variant/primitive_type_stringer.go diff --git a/go.mod b/go.mod index bf9c6888..5f1df00a 100644 --- a/go.mod +++ b/go.mod @@ -43,9 +43,9 @@ require ( github.com/tidwall/sjson
v1.2.5 github.com/zeebo/xxh3 v1.0.2 golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 - golang.org/x/sync v0.13.0 - golang.org/x/sys v0.32.0 - golang.org/x/tools v0.32.0 + golang.org/x/sync v0.14.0 + golang.org/x/sys v0.33.0 + golang.org/x/tools v0.33.0 golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da gonum.org/v1/gonum v0.16.0 google.golang.org/grpc v1.72.0 @@ -66,6 +66,7 @@ require ( github.com/dustin/go-humanize v1.0.1 // indirect github.com/fatih/color v1.15.0 // indirect github.com/goccy/go-yaml v1.11.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/gookit/color v1.5.4 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -88,9 +89,9 @@ require ( github.com/tidwall/pretty v1.2.0 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect golang.org/x/mod v0.24.0 // indirect - golang.org/x/net v0.39.0 // indirect - golang.org/x/term v0.31.0 // indirect - golang.org/x/text v0.24.0 // indirect + golang.org/x/net v0.40.0 // indirect + golang.org/x/term v0.32.0 // indirect + golang.org/x/text v0.25.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a // indirect gopkg.in/yaml.v3 v3.0.1 // indirect modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect @@ -100,3 +101,4 @@ require ( modernc.org/strutil v1.2.0 // indirect modernc.org/token v1.1.0 // indirect ) + diff --git a/go.sum b/go.sum index 764cbcbf..068d9c2d 100644 --- a/go.sum +++ b/go.sum @@ -62,8 +62,8 @@ github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= -github.com/google/go-cmp v0.6.0 
h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ= github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= @@ -198,8 +198,8 @@ go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= -golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= +golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -210,13 +210,13 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= +golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= -golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= +golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -229,28 +229,28 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210615171337-6886f2dfbf5b/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= -golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= +golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= +golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= -golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= +golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU= 
-golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s= +golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= +golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY= golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= diff --git a/parquet-testing b/parquet-testing index 39b91cf8..2dc8bf14 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 39b91cf853062d92f0d20581d37b20dabe70a6a0 +Subproject commit 2dc8bf140ed6e28652fc347211c7d661714c7f95 diff --git a/parquet/variant/basic_type_stringer.go b/parquet/variant/basic_type_stringer.go new file mode 100644 index 00000000..e8cf83e4 --- /dev/null +++ b/parquet/variant/basic_type_stringer.go @@ -0,0 +1,28 @@ +// Code generated by "stringer -type=BasicType -linecomment -output=basic_type_stringer.go"; DO NOT EDIT. + +package variant + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[BasicUndefined - -1] + _ = x[BasicPrimitive-0] + _ = x[BasicShortString-1] + _ = x[BasicObject-2] + _ = x[BasicArray-3] +} + +const _BasicType_name = "UnknownPrimitiveShortStringObjectArray" + +var _BasicType_index = [...]uint8{0, 7, 16, 27, 33, 38} + +func (i BasicType) String() string { + i -= -1 + if i < 0 || i >= BasicType(len(_BasicType_index)-1) { + return "BasicType(" + strconv.FormatInt(int64(i+-1), 10) + ")" + } + return _BasicType_name[_BasicType_index[i]:_BasicType_index[i+1]] +} diff --git a/parquet/variant/builder.go b/parquet/variant/builder.go new file mode 100644 index 00000000..e57770b7 --- /dev/null +++ b/parquet/variant/builder.go @@ -0,0 +1,850 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package variant + +import ( + "bytes" + "cmp" + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "reflect" + "slices" + "strings" + "time" + "unsafe" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/decimal" + "github.com/google/uuid" +) + +// Builder is used to construct Variant values by appending data of various types. +// It manages an internal buffer for the value data and a dictionary for field keys. 
+type Builder struct { + buf bytes.Buffer + dict map[string]uint32 + dictKeys [][]byte + totalDictSize int + allowDuplicates bool +} + +// SetAllowDuplicates controls whether duplicate keys are allowed in objects. +// When true, the last value for a key is used. When false, an error is returned +// if a duplicate key is detected. +func (b *Builder) SetAllowDuplicates(allow bool) { + b.allowDuplicates = allow +} + +// AddKey adds a key to the builder's dictionary and returns its ID. +// If the key already exists in the dictionary, its existing ID is returned. +func (b *Builder) AddKey(key string) (id uint32) { + if b.dict == nil { + b.dict = make(map[string]uint32) + b.dictKeys = make([][]byte, 0, 16) + } + + var ok bool + if id, ok = b.dict[key]; ok { + return id + } + + id = uint32(len(b.dictKeys)) + b.dict[key] = id + b.dictKeys = append(b.dictKeys, unsafe.Slice(unsafe.StringData(key), len(key))) + b.totalDictSize += len(key) + + return id +} + +// AppendOpt represents options for appending time-related values. These are only +// used when using the generic Append method that takes an interface{}. +type AppendOpt int16 + +const ( + // OptTimestampNano specifies that timestamps should use nanosecond precision, + // otherwise microsecond precision is used. + OptTimestampNano AppendOpt = 1 << iota + // OptTimestampUTC specifies that timestamps should be in UTC timezone, otherwise + // no time zone (NTZ) is used. 
+ OptTimestampUTC + // OptTimeAsDate specifies that time.Time values should be encoded as dates + OptTimeAsDate + // OptTimeAsTime specifies that time.Time values should be encoded as a time value + OptTimeAsTime +) + +func extractFieldInfo(f reflect.StructField) (name string, o AppendOpt) { + tag := f.Tag.Get("variant") + if tag == "" { + return f.Name, 0 + } + + parts := strings.Split(tag, ",") + if len(parts) == 1 { + return parts[0], 0 + } + + name = parts[0] + if name == "" { + name = f.Name + } + + for _, opt := range parts[1:] { + switch strings.ToLower(opt) { + case "nanos": + o |= OptTimestampNano + case "utc": + o |= OptTimestampUTC + case "date": + o |= OptTimeAsDate + case "time": + o |= OptTimeAsTime + } + } + + return name, o +} + +// Append adds a value of any supported type to the builder. +// +// Any basic primitive type is supported, the AppendOpt options are used to control how +// timestamps are appended (e.g., as microseconds or nanoseconds and timezone). The options +// also control how a [time.Time] value is appended (e.g., as a date, timestamp, or time). +// +// Appending a value with type `[]any` will construct an array appropriately, appending +// each element. Calling with a map[string]any will construct an object, recursively calling +// Append for each value, propagating the options. +// +// For other types (arbitrary slices, arrays, maps and structs), reflection is used to determine +// the type and whether we can append it. A nil pointer will append a null, while a non-nil +// pointer will append the value that it points to. +// +// For structs, field tags can be used to control the field names and options. Only exported +// fields are considered, with the field name being used as the key. 
A struct tag of `variant` +// can be used with the following format and options: +// +// type MyStruct struct { +// Field1 string `variant:"key"` // Use "key" instead of "Field1" as the field name +// Field2 time.Time `variant:"day,date"` // Use "day" instead of "Field2" as the field name +// // append this value as a "date" value +// Time time.Time `variant:",time"` // Use "Time" as the field name, append the value as +// // a "time" value +// Field3 int `variant:"-"` // Ignore this field +// Timestamp time.Time `variant:"ts"` // Use "ts" as the field name, append value as a +// // timestamp(UTC=false,MICROS) +// Ts2 time.Time `variant:"ts2,nanos,utc"` // Use "ts2" as the field name, append value as a +// // timestamp(UTC=true,NANOS) +// } +// +// There is only one case where options can conflict currently: If both [OptTimeAsDate] and +// [OptTimeAsTime] are set, then [OptTimeAsDate] will take precedence. +// +// Options specified in the struct tags will be OR'd with any options passed to the original call +// to Append. As a result, if a Struct field tag sets [OptTimeAsTime], but the call to Append +// passes [OptTimeAsDate], then the value will be appended as a date since that option takes +// precedence. 
+func (b *Builder) Append(v any, opts ...AppendOpt) error { + var o AppendOpt + for _, opt := range opts { + o |= opt + } + + return b.append(v, o) +} + +func (b *Builder) append(v any, o AppendOpt) error { + switch v := v.(type) { + case nil: + return b.AppendNull() + case bool: + return b.AppendBool(v) + case int8: + return b.AppendInt(int64(v)) + case uint8: + return b.AppendInt(int64(v)) + case int16: + return b.AppendInt(int64(v)) + case uint16: + return b.AppendInt(int64(v)) + case int32: + return b.AppendInt(int64(v)) + case uint32: + return b.AppendInt(int64(v)) + case int64: + return b.AppendInt(v) + case int: + return b.AppendInt(int64(v)) + case uint: + return b.AppendInt(int64(v)) + case float32: + return b.AppendFloat32(v) + case float64: + return b.AppendFloat64(v) + case arrow.Date32: + return b.AppendDate(v) + case arrow.Time64: + return b.AppendTimeMicro(v) + case arrow.Timestamp: + return b.AppendTimestamp(v, o&OptTimestampNano == 0, o&OptTimestampUTC != 0) + case []byte: + return b.AppendBinary(v) + case string: + return b.AppendString(v) + case uuid.UUID: + return b.AppendUUID(v) + case time.Time: + switch { + case o&OptTimeAsDate != 0: + return b.AppendDate(arrow.Date32FromTime(v)) + case o&OptTimeAsTime != 0: + t := v.Sub(v.Truncate(24 * time.Hour)) + return b.AppendTimeMicro(arrow.Time64(t.Microseconds())) + default: + unit := arrow.Microsecond + if o&OptTimestampNano != 0 { + unit = arrow.Nanosecond + } + + if o&OptTimestampUTC != 0 { + v = v.UTC() + } + + t, err := arrow.TimestampFromTime(v, unit) + if err != nil { + return err + } + + return b.AppendTimestamp(t, o&OptTimestampNano == 0, o&OptTimestampUTC != 0) + } + case DecimalValue[decimal.Decimal32]: + return b.AppendDecimal4(v.Scale, v.Value.(decimal.Decimal32)) + case DecimalValue[decimal.Decimal64]: + return b.AppendDecimal8(v.Scale, v.Value.(decimal.Decimal64)) + case DecimalValue[decimal.Decimal128]: + return b.AppendDecimal16(v.Scale, v.Value.(decimal.Decimal128)) + case []any: + 
start, offsets := b.Offset(), make([]int, 0, len(v)) + for _, item := range v { + offsets = append(offsets, b.NextElement(start)) + if err := b.append(item, o); err != nil { + return err + } + } + return b.FinishArray(start, offsets) + case map[string]any: + start, fields := b.Offset(), make([]FieldEntry, 0, len(v)) + for key, item := range v { + fields = append(fields, b.NextField(start, key)) + if err := b.append(item, o); err != nil { + return err + } + } + return b.FinishObject(start, fields) + default: + // attempt to use reflection before we give up! + val := reflect.ValueOf(v) + switch val.Kind() { + case reflect.Pointer, reflect.Interface: + if val.IsNil() { + return b.AppendNull() + } + return b.append(val.Elem().Interface(), o) + case reflect.Array, reflect.Slice: + start, offsets := b.Offset(), make([]int, 0, val.Len()) + for _, item := range val.Seq2() { + offsets = append(offsets, b.NextElement(start)) + if err := b.append(item.Interface(), o); err != nil { + return err + } + } + return b.FinishArray(start, offsets) + case reflect.Map: + if val.Type().Key().Kind() != reflect.String { + return fmt.Errorf("unsupported map key type: %s", val.Type().Key()) + } + + start, fields := b.Offset(), make([]FieldEntry, 0, val.Len()) + for k, v := range val.Seq2() { + fields = append(fields, b.NextField(start, k.String())) + if err := b.append(v.Interface(), o); err != nil { + return err + } + } + return b.FinishObject(start, fields) + case reflect.Struct: + start, fields := b.Offset(), make([]FieldEntry, 0, val.NumField()) + + typ := val.Type() + for i := range typ.NumField() { + f := typ.Field(i) + if !f.IsExported() { + continue + } + + name, opt := extractFieldInfo(f) + if name == "-" { + continue + } + + fields = append(fields, b.NextField(start, name)) + if err := b.append(val.Field(i).Interface(), o|opt); err != nil { + return err + } + } + return b.FinishObject(start, fields) + } + } + return fmt.Errorf("cannot append unsupported type to variant: %T", v) +} 
+ +// AppendNull appends a null value to the builder. +func (b *Builder) AppendNull() error { + return b.buf.WriteByte(primitiveHeader(PrimitiveNull)) +} + +// AppendBool appends a boolean value to the builder. +func (b *Builder) AppendBool(v bool) error { + var t PrimitiveType + if v { + t = PrimitiveBoolTrue + } else { + t = PrimitiveBoolFalse + } + + return b.buf.WriteByte(primitiveHeader(t)) +} + +type primitiveNumeric interface { + int8 | int16 | int32 | int64 | float32 | float64 | + arrow.Date32 | arrow.Time64 +} + +type buffer interface { + io.Writer + io.ByteWriter +} + +func writeBinary[T string | []byte](w buffer, v T) error { + var t PrimitiveType + switch any(v).(type) { + case string: + t = PrimitiveString + case []byte: + t = PrimitiveBinary + } + + if err := w.WriteByte(primitiveHeader(t)); err != nil { + return err + } + + if err := binary.Write(w, binary.LittleEndian, uint32(len(v))); err != nil { + return err + } + + _, err := w.Write([]byte(v)) + return err +} + +func writeNumeric[T primitiveNumeric](w buffer, v T) error { + var t PrimitiveType + switch any(v).(type) { + case int8: + t = PrimitiveInt8 + case int16: + t = PrimitiveInt16 + case int32: + t = PrimitiveInt32 + case int64: + t = PrimitiveInt64 + case float32: + t = PrimitiveFloat + case float64: + t = PrimitiveDouble + case arrow.Date32: + t = PrimitiveDate + case arrow.Time64: + t = PrimitiveTimeMicrosNTZ + } + + if err := w.WriteByte(primitiveHeader(t)); err != nil { + return err + } + + return binary.Write(w, binary.LittleEndian, v) +} + +// AppendInt appends an integer value to the builder, using the smallest +// possible integer representation based on the value's range. 
+func (b *Builder) AppendInt(v int64) error { + b.buf.Grow(9) + switch { + case v >= math.MinInt8 && v <= math.MaxInt8: + return writeNumeric(&b.buf, int8(v)) + case v >= math.MinInt16 && v <= math.MaxInt16: + return writeNumeric(&b.buf, int16(v)) + case v >= math.MinInt32 && v <= math.MaxInt32: + return writeNumeric(&b.buf, int32(v)) + default: + return writeNumeric(&b.buf, v) + } +} + +// AppendFloat32 appends a 32-bit floating point value to the builder. +func (b *Builder) AppendFloat32(v float32) error { + b.buf.Grow(5) + return writeNumeric(&b.buf, v) +} + +// AppendFloat64 appends a 64-bit floating point value to the builder. +func (b *Builder) AppendFloat64(v float64) error { + b.buf.Grow(9) + return writeNumeric(&b.buf, v) +} + +// AppendDate appends a date value to the builder. +func (b *Builder) AppendDate(v arrow.Date32) error { + b.buf.Grow(5) + return writeNumeric(&b.buf, v) +} + +// AppendTimeMicro appends a time value with microsecond precision to the builder. +func (b *Builder) AppendTimeMicro(v arrow.Time64) error { + b.buf.Grow(9) + return writeNumeric(&b.buf, v) +} + +// AppendTimestamp appends a timestamp value to the builder. +// The useMicros parameter controls whether microsecond or nanosecond precision is used. +// The useUTC parameter controls whether the timestamp is in UTC timezone or has no time zone (NTZ). +func (b *Builder) AppendTimestamp(v arrow.Timestamp, useMicros, useUTC bool) error { + b.buf.Grow(9) + var t PrimitiveType + if useMicros { + t = PrimitiveTimestampMicrosNTZ + } else { + t = PrimitiveTimestampNanosNTZ + } + + if useUTC { + t-- + } + + if err := b.buf.WriteByte(primitiveHeader(t)); err != nil { + return err + } + + return binary.Write(&b.buf, binary.LittleEndian, v) +} + +// AppendBinary appends a binary value to the builder. +func (b *Builder) AppendBinary(v []byte) error { + b.buf.Grow(5 + len(v)) + return writeBinary(&b.buf, v) +} + +// AppendString appends a string value to the builder. 
+// Small strings are encoded using the short string representation if small enough. +func (b *Builder) AppendString(v string) error { + if len(v) > maxShortStringSize { + b.buf.Grow(5 + len(v)) + return writeBinary(&b.buf, v) + } + + b.buf.Grow(1 + len(v)) + if err := b.buf.WriteByte(shortStrHeader(len(v))); err != nil { + return err + } + + _, err := b.buf.WriteString(v) + return err +} + +// AppendUUID appends a UUID value to the builder. +func (b *Builder) AppendUUID(v uuid.UUID) error { + b.buf.Grow(17) + if err := b.buf.WriteByte(primitiveHeader(PrimitiveUUID)); err != nil { + return err + } + + m, _ := v.MarshalBinary() + _, err := b.buf.Write(m) + return err +} + +// AppendDecimal4 appends a 4-byte decimal value with the specified scale to the builder. +func (b *Builder) AppendDecimal4(scale uint8, v decimal.Decimal32) error { + b.buf.Grow(6) + if err := b.buf.WriteByte(primitiveHeader(PrimitiveDecimal4)); err != nil { + return err + } + + if err := b.buf.WriteByte(scale); err != nil { + return err + } + + return binary.Write(&b.buf, binary.LittleEndian, int32(v)) +} + +// AppendDecimal8 appends a 8-byte decimal value with the specified scale to the builder. +func (b *Builder) AppendDecimal8(scale uint8, v decimal.Decimal64) error { + b.buf.Grow(10) + return errors.Join( + b.buf.WriteByte(primitiveHeader(PrimitiveDecimal8)), + b.buf.WriteByte(scale), + binary.Write(&b.buf, binary.LittleEndian, int64(v)), + ) +} + +// AppendDecimal16 appends a 16-byte decimal value with the specified scale to the builder. +func (b *Builder) AppendDecimal16(scale uint8, v decimal.Decimal128) error { + b.buf.Grow(18) + return errors.Join( + b.buf.WriteByte(primitiveHeader(PrimitiveDecimal16)), + b.buf.WriteByte(scale), + binary.Write(&b.buf, binary.LittleEndian, v.LowBits()), + binary.Write(&b.buf, binary.LittleEndian, v.HighBits()), + ) +} + +// Offset returns the current offset in the builder's buffer. 
Generally used for +// grabbing a starting point for building an array or object. +func (b *Builder) Offset() int { + return b.buf.Len() +} + +// NextElement returns the offset of the next element relative to the start position. +// Use when building arrays to track element positions. The following creates a variant +// equivalent to `[5, 10]`. +// +// var b variant.Builder +// start, offsets := b.Offset(), make([]int, 0) +// offsets = append(offsets, b.NextElement(start)) +// b.Append(5) +// offsets = append(offsets, b.NextElement(start)) +// b.Append(10) +// b.FinishArray(start, offsets) +// +// The value returned by this is equivalent to `b.Offset() - start`, as offsets are all +// relative to the start position. This allows for creating nested arrays, the following +// creates a variant equivalent to `[5, [10, 20], 30]`. +// +// var b variant.Builder +// start, offsets := b.Offset(), make([]int, 0) +// offsets = append(offsets, b.NextElement(start)) +// b.Append(5) +// offsets = append(offsets, b.NextElement(start)) +// +// nestedStart, nestedOffsets := b.Offset(), make([]int, 0) +// nestedOffsets = append(nestedOffsets, b.NextElement(nestedStart)) +// b.Append(10) +// nestedOffsets = append(nestedOffsets, b.NextElement(nestedStart)) +// b.Append(20) +// b.FinishArray(nestedStart, nestedOffsets) +// +// offsets = append(offsets, b.NextElement(start)) +// b.Append(30) +// b.FinishArray(start, offsets) +func (b *Builder) NextElement(start int) int { + return b.Offset() - start +} + +// FinishArray finalizes an array value in the builder. +// The start parameter is the offset where the array begins. +// The offsets parameter contains the offsets of each element in the array. See [Builder.NextElement] +// for examples of how to use this. 
+func (b *Builder) FinishArray(start int, offsets []int) error { + var ( + dataSize, sz = b.buf.Len() - start, len(offsets) + isLarge = sz > math.MaxUint8 + sizeBytes = 1 + ) + + if isLarge { + sizeBytes = 4 + } + + if dataSize < 0 { + return errors.New("invalid array size") + } + + offsetSize := intSize(dataSize) + headerSize := 1 + sizeBytes + (sz+1)*int(offsetSize) + + // shift the just written data to make room for the header section + b.buf.Grow(headerSize) + av := b.buf.AvailableBuffer() + if _, err := b.buf.Write(av[:headerSize]); err != nil { + return err + } + + bs := b.buf.Bytes() + copy(bs[start+headerSize:], bs[start:start+dataSize]) + + // populate the header + bs[start] = arrayHeader(isLarge, offsetSize) + writeOffset(bs[start+1:], sz, uint8(sizeBytes)) + + offsetsStart := start + 1 + sizeBytes + for i, off := range offsets { + writeOffset(bs[offsetsStart+i*int(offsetSize):], off, offsetSize) + } + writeOffset(bs[offsetsStart+sz*int(offsetSize):], dataSize, offsetSize) + + return nil +} + +// FieldEntry represents a field in an object, with its key, ID, and offset. +// Usually constructed by using [Builder.NextField] and then passed to [Builder.FinishObject]. +type FieldEntry struct { + Key string + ID uint32 + Offset int +} + +// NextField creates a new field entry for an object with the given key. +// The start parameter is the offset where the object begins. The following example would +// construct a variant equivalent to `{"key1": 5, "key2": 10}`. +// +// var b variant.Builder +// start, fields := b.Offset(), make([]variant.FieldEntry, 0) +// fields = append(fields, b.NextField(start, "key1")) +// b.Append(5) +// fields = append(fields, b.NextField(start, "key2")) +// b.Append(10) +// b.FinishObject(start, fields) +// +// This allows for creating nested objects, the following example would create a variant +// equivalent to `{"key1": 5, "key2": {"key3": 10, "key4": 20}, "key5": 30}`. 
+// +// var b variant.Builder +// start, fields := b.Offset(), make([]variant.FieldEntry, 0) +// fields = append(fields, b.NextField(start, "key1")) +// b.Append(5) +// fields = append(fields, b.NextField(start, "key2")) +// nestedStart, nestedFields := b.Offset(), make([]variant.FieldEntry, 0) +// nestedFields = append(nestedFields, b.NextField(nestedStart, "key3")) +// b.Append(10) +// nestedFields = append(nestedFields, b.NextField(nestedStart, "key4")) +// b.Append(20) +// b.FinishObject(nestedStart, nestedFields) +// fields = append(fields, b.NextField(start, "key5")) +// b.Append(30) +// b.FinishObject(start, fields) +// +// The offset value returned by this is equivalent to `b.Offset() - start`, as offsets are all +// relative to the start position. The key provided will be passed to the [Builder.AddKey] method +// to ensure that the key is added to the dictionary and an ID is assigned. It will re-use existing +// IDs if the key already exists in the dictionary. +func (b *Builder) NextField(start int, key string) FieldEntry { + id := b.AddKey(key) + return FieldEntry{ + Key: key, + ID: id, + Offset: b.Offset() - start, + } +} + +// FinishObject finalizes an object value in the builder. +// The start parameter is the offset where the object begins. +// The fields parameter contains the entries for each field in the object. See [Builder.NextField] +// for examples of how to use this. +// +// The fields are sorted by key before finalizing the object. If duplicate keys are found, +// the last value for a key is kept if [Builder.SetAllowDuplicates] is set to true. If false, +// an error is returned. 
+func (b *Builder) FinishObject(start int, fields []FieldEntry) error { + slices.SortFunc(fields, func(a, b FieldEntry) int { + return cmp.Compare(a.Key, b.Key) + }) + + sz := len(fields) + var maxID uint32 + if sz > 0 { + maxID = fields[0].ID + } + + // if a duplicate key is found, one of two things happens: + // - if allowDuplicates is true, then the field with the greatest + // offset value (the last appended field) is kept. + // - if allowDuplicates is false, then an error is returned + if b.allowDuplicates { + distinctPos := 0 + // maintain a list of distinct keys in-place + for i := 1; i < sz; i++ { + maxID = max(maxID, fields[i].ID) + if fields[i].ID == fields[i-1].ID { + // found a duplicate key. keep the + // field with a greater offset + if fields[distinctPos].Offset < fields[i].Offset { + fields[distinctPos].Offset = fields[i].Offset + } + } else { + // found distinct key, add field to the list + distinctPos++ + fields[distinctPos] = fields[i] + } + } + + if distinctPos+1 < len(fields) { + sz = distinctPos + 1 + // resize fields to size + fields = fields[:sz] + // sort the fields by offsets so that we can move the value + // data of each field to the new offset without overwriting the + // fields after it. 
+ slices.SortFunc(fields, func(a, b FieldEntry) int { + return cmp.Compare(a.Offset, b.Offset) + }) + + buf := b.buf.Bytes() + curOffset := 0 + for i := range sz { + oldOffset := fields[i].Offset + fieldSize := valueSize(buf[start+oldOffset:]) + copy(buf[start+curOffset:], buf[start+oldOffset:start+oldOffset+fieldSize]) + fields[i].Offset = curOffset + curOffset += fieldSize + } + b.buf.Truncate(start + curOffset) + // change back to sort order by field keys to meet variant spec + slices.SortFunc(fields, func(a, b FieldEntry) int { + return cmp.Compare(a.Key, b.Key) + }) + } + } else { + for i := 1; i < sz; i++ { + maxID = max(maxID, fields[i].ID) + if fields[i].Key == fields[i-1].Key { + return fmt.Errorf("disallowed duplicate key found: %s", fields[i].Key) + } + } + } + + var ( + dataSize = b.buf.Len() - start + isLarge = sz > math.MaxUint8 + sizeBytes = 1 + ) + + if isLarge { + sizeBytes = 4 + } + + if dataSize < 0 { + return errors.New("invalid object size") + } + + idSize, offsetSize := intSize(int(maxID)), intSize(dataSize) + headerSize := 1 + sizeBytes + sz*int(idSize) + (sz+1)*int(offsetSize) + // shift the just written data to make room for the header section + b.buf.Grow(headerSize) + av := b.buf.AvailableBuffer() + if _, err := b.buf.Write(av[:headerSize]); err != nil { + return err + } + + bs := b.buf.Bytes() + copy(bs[start+headerSize:], bs[start:start+dataSize]) + + // populate the header + bs[start] = objectHeader(isLarge, idSize, offsetSize) + writeOffset(bs[start+1:], sz, uint8(sizeBytes)) + + idStart := start + 1 + sizeBytes + offsetStart := idStart + sz*int(idSize) + for i, field := range fields { + writeOffset(bs[idStart+i*int(idSize):], int(field.ID), idSize) + writeOffset(bs[offsetStart+i*int(offsetSize):], field.Offset, offsetSize) + } + writeOffset(bs[offsetStart+sz*int(offsetSize):], dataSize, offsetSize) + return nil +} + +// Reset truncates the builder's buffer and clears the dictionary while re-using the +// underlying storage where 
possible. This allows for reusing the builder while keeping +// the total memory usage low. The caveat to this is that any variant value returned +// by calling [Builder.Build] must be cloned with [Value.Clone] before calling this +// method. Otherwise, the byte slice used by the value will be invalidated upon calling +// this method. +// +// For trivial cases where the builder is not reused, this method never needs to be called, +// and the variant built by the builder gets to avoid having to copy the buffer, just referring +// to it directly. +func (b *Builder) Reset() { + b.buf.Reset() + b.dict = make(map[string]uint32) + for i := range b.dictKeys { + b.dictKeys[i] = nil + } + b.dictKeys = b.dictKeys[:0] +} + +// Build creates a Variant Value from the builder's current state. +// The returned Value includes both the value data and the metadata (dictionary). +// +// Importantly, the value data is the returned variant value is not copied here. This will +// return the raw buffer data owned by the builder's buffer. If you wish to reuse a builder, +// then the [Value.Clone] method must be called on the returned value to copy the data before +// calling [Builder.Reset]. This enables trivial cases that don't reuse the builder to avoid +// performing this copy. +func (b *Builder) Build() (Value, error) { + nkeys := len(b.dictKeys) + + // determine the number of bytes required per offset entry. + // the largest offset is the one-past-the-end value, the total size. + // It's very unlikely that the number of keys could be larger, but + // incorporate that into the calculation in case of pathological data. 
+ maxSize := max(b.totalDictSize, nkeys) + if maxSize > metadataMaxSizeLimit { + return Value{}, fmt.Errorf("metadata size too large: %d", maxSize) + } + + offsetSize := intSize(int(maxSize)) + offsetStart := 1 + offsetSize + stringStart := int(offsetStart) + (nkeys+1)*int(offsetSize) + metadataSize := stringStart + b.totalDictSize + + if metadataSize > metadataMaxSizeLimit { + return Value{}, fmt.Errorf("metadata size too large: %d", metadataSize) + } + + meta := make([]byte, metadataSize) + + meta[0] = supportedVersion | ((offsetSize - 1) << 6) + if nkeys > 0 && slices.IsSortedFunc(b.dictKeys, bytes.Compare) { + meta[0] |= 1 << 4 + } + writeOffset(meta[1:], nkeys, offsetSize) + + curOffset := 0 + for i, k := range b.dictKeys { + writeOffset(meta[int(offsetStart)+i*int(offsetSize):], curOffset, offsetSize) + curOffset += copy(meta[stringStart+curOffset:], k) + } + writeOffset(meta[int(offsetStart)+nkeys*int(offsetSize):], curOffset, offsetSize) + + return Value{ + value: b.buf.Bytes(), + meta: Metadata{ + data: meta, + keys: b.dictKeys, + }, + }, nil +} diff --git a/parquet/variant/builder_test.go b/parquet/variant/builder_test.go new file mode 100644 index 00000000..21292f50 --- /dev/null +++ b/parquet/variant/builder_test.go @@ -0,0 +1,787 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package variant_test + +import ( + "encoding/json" + "testing" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/decimal" + "github.com/apache/arrow-go/v18/arrow/decimal128" + "github.com/apache/arrow-go/v18/parquet/variant" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBuildNullValue(t *testing.T) { + var b variant.Builder + b.AppendNull() + + v, err := b.Build() + require.NoError(t, err) + + assert.Equal(t, variant.Null, v.Type()) + assert.EqualValues(t, 1, v.Metadata().Version()) + assert.Zero(t, v.Metadata().DictionarySize()) +} + +func TestBuildPrimitive(t *testing.T) { + tests := []struct { + name string + op func(*variant.Builder) error + }{ + {"primitive_boolean_true", func(b *variant.Builder) error { + return b.AppendBool(true) + }}, + {"primitive_boolean_false", func(b *variant.Builder) error { + return b.AppendBool(false) + }}, + // AppendInt will use the smallest possible int type + {"primitive_int8", func(b *variant.Builder) error { return b.AppendInt(42) }}, + {"primitive_int16", func(b *variant.Builder) error { return b.AppendInt(1234) }}, + {"primitive_int32", func(b *variant.Builder) error { return b.AppendInt(123456) }}, + // FIXME: https://github.com/apache/parquet-testing/issues/82 + // primitive_int64 is an int32 value, but the metadata is int64 + {"primitive_int64", func(b *variant.Builder) error { return b.AppendInt(12345678) }}, + {"primitive_float", func(b *variant.Builder) error { return b.AppendFloat32(1234568000) }}, + {"primitive_double", func(b *variant.Builder) error { return b.AppendFloat64(1234567890.1234) }}, + {"primitive_string", func(b *variant.Builder) error { + return b.AppendString(`This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii 
characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!`) + }}, + {"short_string", func(b *variant.Builder) error { return b.AppendString(`Less than 64 bytes (❤️ with utf8)`) }}, + // 031337deadbeefcafe + {"primitive_binary", func(b *variant.Builder) error { + return b.AppendBinary([]byte{0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe}) + }}, + {"primitive_decimal4", func(b *variant.Builder) error { return b.AppendDecimal4(2, 1234) }}, + {"primitive_decimal8", func(b *variant.Builder) error { return b.AppendDecimal8(2, 1234567890) }}, + {"primitive_decimal16", func(b *variant.Builder) error { return b.AppendDecimal16(2, decimal128.FromU64(1234567891234567890)) }}, + {"primitive_date", func(b *variant.Builder) error { return b.AppendDate(20194) }}, + {"primitive_timestamp", func(b *variant.Builder) error { return b.AppendTimestamp(1744821296780000, true, true) }}, + {"primitive_timestampntz", func(b *variant.Builder) error { return b.AppendTimestamp(1744806896780000, true, false) }}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + expected := loadVariant(t, tt.name) + + var b variant.Builder + require.NoError(t, tt.op(&b)) + + v, err := b.Build() + require.NoError(t, err) + + assert.Equal(t, expected.Type(), v.Type()) + assert.Equal(t, expected.Bytes(), v.Bytes()) + assert.Equal(t, expected.Metadata().Bytes(), v.Metadata().Bytes()) + }) + } +} + +func TestBuildInt64(t *testing.T) { + var b variant.Builder + require.NoError(t, b.AppendInt(1234567890987654321)) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Int64, v.Type()) + assert.Equal(t, []byte{primitiveHeader(variant.PrimitiveInt64), + 0xB1, 0x1C, 0x6C, 0xB1, 0xF4, 0x10, 0x22, 0x11}, v.Bytes()) +} + +func TestBuildObject(t *testing.T) { + var b variant.Builder + start := b.Offset() + + fields := make([]variant.FieldEntry, 0, 7) + + fields = append(fields, b.NextField(start, "int_field")) + require.NoError(t, b.AppendInt(1)) + + fields = append(fields, 
b.NextField(start, "double_field")) + require.NoError(t, b.AppendDecimal4(8, 123456789)) + + fields = append(fields, b.NextField(start, "boolean_true_field")) + require.NoError(t, b.AppendBool(true)) + + fields = append(fields, b.NextField(start, "boolean_false_field")) + require.NoError(t, b.AppendBool(false)) + + fields = append(fields, b.NextField(start, "string_field")) + require.NoError(t, b.AppendString("Apache Parquet")) + + fields = append(fields, b.NextField(start, "null_field")) + require.NoError(t, b.AppendNull()) + + fields = append(fields, b.NextField(start, "timestamp_field")) + require.NoError(t, b.AppendString("2025-04-16T12:34:56.78")) + + require.NoError(t, b.FinishObject(start, fields)) + v, err := b.Build() + require.NoError(t, err) + + assert.Equal(t, variant.Object, v.Type()) + expected := loadVariant(t, "object_primitive") + + assert.Equal(t, expected.Metadata().DictionarySize(), v.Metadata().DictionarySize()) + assert.Equal(t, expected.Metadata().Bytes(), v.Metadata().Bytes()) + assert.Equal(t, expected.Bytes(), v.Bytes()) +} + +func TestBuildObjectDuplicateKeys(t *testing.T) { + t.Run("disallow duplicates", func(t *testing.T) { + var b variant.Builder + start := b.Offset() + + fields := make([]variant.FieldEntry, 0, 3) + + fields = append(fields, b.NextField(start, "int_field")) + require.NoError(t, b.AppendInt(1)) + + fields = append(fields, b.NextField(start, "int_field")) + require.NoError(t, b.AppendInt(2)) + + fields = append(fields, b.NextField(start, "int_field")) + require.NoError(t, b.AppendInt(3)) + + require.Error(t, b.FinishObject(start, fields)) + }) + + t.Run("allow duplicates", func(t *testing.T) { + var b variant.Builder + start := b.Offset() + + fields := make([]variant.FieldEntry, 0, 3) + + fields = append(fields, b.NextField(start, "int_field")) + require.NoError(t, b.AppendInt(1)) + + fields = append(fields, b.NextField(start, "string_field")) + require.NoError(t, b.AppendString("Apache Parquet")) + + fields = 
append(fields, b.NextField(start, "int_field")) + require.NoError(t, b.AppendInt(2)) + + fields = append(fields, b.NextField(start, "int_field")) + require.NoError(t, b.AppendInt(3)) + + fields = append(fields, b.NextField(start, "string_field")) + require.NoError(t, b.AppendString("Apache Arrow")) + + b.SetAllowDuplicates(true) + require.NoError(t, b.FinishObject(start, fields)) + + v, err := b.Build() + require.NoError(t, err) + + assert.Equal(t, variant.Object, v.Type()) + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{ + "int_field": 3, + "string_field": "Apache Arrow" + }`, string(out)) + }) +} + +func TestBuildObjectNested(t *testing.T) { + var b variant.Builder + + start := b.Offset() + topFields := make([]variant.FieldEntry, 0, 3) + + topFields = append(topFields, b.NextField(start, "id")) + require.NoError(t, b.AppendInt(1)) + + topFields = append(topFields, b.NextField(start, "observation")) + + observeFields := make([]variant.FieldEntry, 0, 3) + observeStart := b.Offset() + observeFields = append(observeFields, b.NextField(observeStart, "location")) + require.NoError(t, b.AppendString("In the Volcano")) + observeFields = append(observeFields, b.NextField(observeStart, "time")) + require.NoError(t, b.AppendString("12:34:56")) + observeFields = append(observeFields, b.NextField(observeStart, "value")) + + valueStart := b.Offset() + valueFields := make([]variant.FieldEntry, 0, 2) + valueFields = append(valueFields, b.NextField(valueStart, "humidity")) + require.NoError(t, b.AppendInt(456)) + valueFields = append(valueFields, b.NextField(valueStart, "temperature")) + require.NoError(t, b.AppendInt(123)) + + require.NoError(t, b.FinishObject(valueStart, valueFields)) + require.NoError(t, b.FinishObject(observeStart, observeFields)) + + topFields = append(topFields, b.NextField(start, "species")) + speciesStart := b.Offset() + speciesFields := make([]variant.FieldEntry, 0, 2) + speciesFields = append(speciesFields, 
b.NextField(speciesStart, "name")) + require.NoError(t, b.AppendString("lava monster")) + + speciesFields = append(speciesFields, b.NextField(speciesStart, "population")) + require.NoError(t, b.AppendInt(6789)) + + require.NoError(t, b.FinishObject(speciesStart, speciesFields)) + require.NoError(t, b.FinishObject(start, topFields)) + + v, err := b.Build() + require.NoError(t, err) + + out, err := json.Marshal(v) + require.NoError(t, err) + + assert.JSONEq(t, `{ + "id": 1, + "observation": { + "location": "In the Volcano", + "time": "12:34:56", + "value": { + "humidity": 456, + "temperature": 123 + } + }, + "species": { + "name": "lava monster", + "population": 6789 + } + }`, string(out)) +} + +func TestBuildUUID(t *testing.T) { + u := uuid.MustParse("00112233-4455-6677-8899-aabbccddeeff") + + var b variant.Builder + require.NoError(t, b.AppendUUID(u)) + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.UUID, v.Type()) + assert.Equal(t, []byte{primitiveHeader(variant.PrimitiveUUID), + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, v.Bytes()) +} + +func TestBuildTimestampNanos(t *testing.T) { + t.Run("ts nanos tz negative", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.AppendTimestamp(-1, false, true)) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.TimestampNanos, v.Type()) + assert.Equal(t, []byte{primitiveHeader(variant.PrimitiveTimestampNanos), + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, v.Bytes()) + }) + + t.Run("ts nanos tz positive", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.AppendTimestamp(1744877350123456789, false, true)) + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.TimestampNanos, v.Type()) + assert.Equal(t, []byte{primitiveHeader(variant.PrimitiveTimestampNanos), + 0x15, 0xC9, 0xBB, 0x86, 0xB4, 0x0C, 0x37, 0x18}, v.Bytes()) + }) + + t.Run("ts nanos ntz positive", func(t 
*testing.T) { + var b variant.Builder + require.NoError(t, b.AppendTimestamp(1744877350123456789, false, false)) + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.TimestampNanosNTZ, v.Type()) + assert.Equal(t, []byte{primitiveHeader(variant.PrimitiveTimestampNanosNTZ), + 0x15, 0xC9, 0xBB, 0x86, 0xB4, 0x0C, 0x37, 0x18}, v.Bytes()) + }) +} + +func TestBuildArrayValues(t *testing.T) { + t.Run("array primitive", func(t *testing.T) { + var b variant.Builder + + start := b.Offset() + offsets := make([]int, 0, 4) + + offsets = append(offsets, b.NextElement(start)) + require.NoError(t, b.AppendInt(2)) + + offsets = append(offsets, b.NextElement(start)) + require.NoError(t, b.AppendInt(1)) + + offsets = append(offsets, b.NextElement(start)) + require.NoError(t, b.AppendInt(5)) + + offsets = append(offsets, b.NextElement(start)) + require.NoError(t, b.AppendInt(9)) + + require.NoError(t, b.FinishArray(start, offsets)) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Array, v.Type()) + + expected := loadVariant(t, "array_primitive") + assert.Equal(t, expected.Metadata().Bytes(), v.Metadata().Bytes()) + assert.Equal(t, expected.Bytes(), v.Bytes()) + }) + + t.Run("array empty", func(t *testing.T) { + var b variant.Builder + + require.NoError(t, b.FinishArray(b.Offset(), nil)) + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Array, v.Type()) + + expected := loadVariant(t, "array_empty") + assert.Equal(t, expected.Metadata().Bytes(), v.Metadata().Bytes()) + assert.Equal(t, expected.Bytes(), v.Bytes()) + }) + + t.Run("array nested", func(t *testing.T) { + var b variant.Builder + + start := b.Offset() + offsets := make([]int, 0, 3) + + { + offsets = append(offsets, b.NextElement(start)) + objStart := b.Offset() + objFields := make([]variant.FieldEntry, 0, 2) + objFields = append(objFields, b.NextField(objStart, "id")) + require.NoError(t, b.AppendInt(1)) + + objFields = append(objFields, 
b.NextField(objStart, "thing")) + thingObjStart := b.Offset() + thingObjFields := make([]variant.FieldEntry, 0, 1) + thingObjFields = append(thingObjFields, b.NextField(thingObjStart, "names")) + + namesStart := b.Offset() + namesOffsets := make([]int, 0, 2) + namesOffsets = append(namesOffsets, b.NextElement(namesStart)) + require.NoError(t, b.AppendString("Contrarian")) + namesOffsets = append(namesOffsets, b.NextElement(namesStart)) + require.NoError(t, b.AppendString("Spider")) + require.NoError(t, b.FinishArray(namesStart, namesOffsets)) + + require.NoError(t, b.FinishObject(thingObjStart, thingObjFields)) + require.NoError(t, b.FinishObject(objStart, objFields)) + } + { + offsets = append(offsets, b.NextElement(start)) + b.AppendNull() + } + { + offsets = append(offsets, b.NextElement(start)) + objStart := b.Offset() + objFields := make([]variant.FieldEntry, 0, 3) + objFields = append(objFields, b.NextField(objStart, "id")) + require.NoError(t, b.AppendInt(2)) + + objFields = append(objFields, b.NextField(objStart, "names")) + namesStart := b.Offset() + namesOffsets := make([]int, 0, 3) + namesOffsets = append(namesOffsets, b.NextElement(namesStart)) + require.NoError(t, b.AppendString("Apple")) + namesOffsets = append(namesOffsets, b.NextElement(namesStart)) + require.NoError(t, b.AppendString("Ray")) + namesOffsets = append(namesOffsets, b.NextElement(namesStart)) + require.NoError(t, b.AppendNull()) + require.NoError(t, b.FinishArray(namesStart, namesOffsets)) + + objFields = append(objFields, b.NextField(objStart, "type")) + require.NoError(t, b.AppendString("if")) + + require.NoError(t, b.FinishObject(objStart, objFields)) + } + + require.NoError(t, b.FinishArray(start, offsets)) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Array, v.Type()) + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `[ + {"id": 1, "thing": {"names": ["Contrarian", "Spider"]}}, + null, + {"id": 2, "names": ["Apple", "Ray", 
null], "type": "if"} + ]`, string(out)) + }) +} + +func TestAppendPrimitives(t *testing.T) { + tests := []struct { + name string + value any + valueOut any + expected variant.Type + }{ + {"null", nil, nil, variant.Null}, + {"bool_true", true, true, variant.Bool}, + {"bool_false", false, false, variant.Bool}, + {"int8", int8(42), int8(42), variant.Int8}, + {"uint8", uint8(42), int8(42), variant.Int8}, + {"int16", int16(1234), int16(1234), variant.Int16}, + {"uint16", uint16(1234), int16(1234), variant.Int16}, + {"int32", int32(123456), int32(123456), variant.Int32}, + {"uint32", uint32(123456), int32(123456), variant.Int32}, + {"int64", int64(1234567890123), int64(1234567890123), variant.Int64}, + {"int", int(123456), int32(123456), variant.Int32}, + {"uint", uint(123456), int32(123456), variant.Int32}, + {"float32", float32(123.45), float32(123.45), variant.Float}, + {"float64", float64(123.45), float64(123.45), variant.Double}, + {"string", "test string", "test string", variant.String}, + {"bytes", []byte{1, 2, 3, 4}, []byte{1, 2, 3, 4}, variant.Binary}, + {"date", arrow.Date32(2023), arrow.Date32(2023), variant.Date}, + {"timestamp", arrow.Timestamp(1234567890), arrow.Timestamp(1234567890), variant.TimestampMicrosNTZ}, + {"time", arrow.Time64(123456), arrow.Time64(123456), variant.Time}, + {"uuid", uuid.MustParse("00112233-4455-6677-8899-aabbccddeeff"), + uuid.MustParse("00112233-4455-6677-8899-aabbccddeeff"), variant.UUID}, + {"decimal4", variant.DecimalValue[decimal.Decimal32]{ + Scale: 2, Value: decimal.Decimal32(1234), + }, variant.DecimalValue[decimal.Decimal32]{ + Scale: 2, Value: decimal.Decimal32(1234), + }, variant.Decimal4}, + {"decimal8", variant.DecimalValue[decimal.Decimal64]{ + Scale: 2, Value: decimal.Decimal64(1234), + }, variant.DecimalValue[decimal.Decimal64]{ + Scale: 2, Value: decimal.Decimal64(1234), + }, variant.Decimal8}, + {"decimal16", variant.DecimalValue[decimal.Decimal128]{ + Scale: 2, Value: decimal128.FromU64(1234), + }, 
variant.DecimalValue[decimal.Decimal128]{ + Scale: 2, Value: decimal128.FromU64(1234), + }, variant.Decimal16}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(tt.value)) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, tt.expected, v.Type()) + assert.Equal(t, tt.valueOut, v.Value()) + }) + } +} + +func TestAppendTimestampOptions(t *testing.T) { + testTime := time.Date(2023, 5, 15, 14, 30, 0, 123456789, time.UTC) + + tests := []struct { + name string + opts []variant.AppendOpt + expected variant.Type + }{ + {"default_micros", nil, variant.TimestampMicrosNTZ}, + {"nanos", []variant.AppendOpt{variant.OptTimestampNano}, variant.TimestampNanosNTZ}, + {"utc_micros", []variant.AppendOpt{variant.OptTimestampUTC}, variant.TimestampMicros}, + {"utc_nanos", []variant.AppendOpt{variant.OptTimestampUTC, variant.OptTimestampNano}, variant.TimestampNanos}, + {"as_date", []variant.AppendOpt{variant.OptTimeAsDate}, variant.Date}, + {"as_time", []variant.AppendOpt{variant.OptTimeAsTime}, variant.Time}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(testTime, tt.opts...)) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, tt.expected, v.Type()) + }) + } +} + +func TestAppendArrays(t *testing.T) { + t.Run("slice_of_any", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append([]any{1, "test", true, nil})) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Array, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `[1, "test", true, null]`, string(out)) + }) + + t.Run("slice_of_ints", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append([]int{10, 20, 30})) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Array, v.Type()) + + out, err := json.Marshal(v) + 
require.NoError(t, err) + assert.JSONEq(t, `[10, 20, 30]`, string(out)) + }) + + t.Run("nested_slices", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append([]any{ + []int{1, 2}, + []string{"a", "b"}, + })) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Array, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `[[1, 2], ["a", "b"]]`, string(out)) + }) +} + +func TestAppendMaps(t *testing.T) { + t.Run("map_string_any", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(map[string]any{ + "int": 123, + "str": "test", + "bool": true, + "null": nil, + })) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Object, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{"bool": true, "int": 123, "null": null, "str": "test"}`, string(out)) + }) + + t.Run("map_string_int", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(map[string]int{ + "int": 123, + "int2": 456, + })) + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Object, v.Type()) + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{"int": 123, "int2": 456}`, string(out)) + }) + + t.Run("map_with_nested_objects", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(map[string]any{ + "metadata": map[string]any{ + "id": 1, + "name": "test", + }, + "values": []int{10, 20, 30}, + })) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Object, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{"metadata": {"id": 1, "name": "test"}, "values": [10, 20, 30]}`, string(out)) + }) + + t.Run("unsupported_map_key", func(t *testing.T) { + var b variant.Builder + err := b.Append(map[int]string{1: "test"}) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unsupported map key type") + }) +} + +type 
SimpleStruct struct { + ID int + Name string + IsValid bool +} + +type StructWithTags struct { + ID int `variant:"id"` + Name string `variant:"name"` + Ignored string `variant:"-"` + Timestamp time.Time `variant:"ts,nanos,utc"` + Date time.Time `variant:"date,date"` + TimeOnly time.Time `variant:",time"` +} + +type NestedStruct struct { + ID int `variant:"id"` + Metadata *SimpleStruct `variant:"meta"` + Tags []string `variant:"tags"` +} + +func TestAppendStructs(t *testing.T) { + t.Run("simple_struct", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(SimpleStruct{ + ID: 123, + Name: "test", + IsValid: true, + })) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Object, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{"ID": 123, "Name": "test", "IsValid": true}`, string(out)) + }) + + t.Run("struct_with_tags", func(t *testing.T) { + testTime := time.Date(2023, 5, 15, 14, 30, 0, 123456789, time.UTC) + var b variant.Builder + require.NoError(t, b.Append(StructWithTags{ + ID: 123, + Name: "test", + Ignored: "should not appear", + Timestamp: testTime, + Date: testTime, + TimeOnly: testTime, + })) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Object, v.Type()) + + obj := v.Value().(variant.ObjectValue) + id, err := obj.ValueByKey("id") + require.NoError(t, err) + assert.Equal(t, variant.Int8, id.Value.Type()) + assert.Equal(t, int8(123), id.Value.Value()) + + name, err := obj.ValueByKey("name") + require.NoError(t, err) + assert.Equal(t, variant.String, name.Value.Type()) + assert.Equal(t, "test", name.Value.Value()) + + ignored, err := obj.ValueByKey("Ignored") + require.ErrorIs(t, err, arrow.ErrNotFound) + assert.Zero(t, ignored) + + ts, err := obj.ValueByKey("ts") + require.NoError(t, err) + assert.Equal(t, variant.TimestampNanos, ts.Value.Type()) + assert.Equal(t, arrow.Timestamp(testTime.UnixNano()), ts.Value.Value()) + + date, err := 
obj.ValueByKey("date") + require.NoError(t, err) + assert.Equal(t, variant.Date, date.Value.Type()) + assert.Equal(t, arrow.Date32FromTime(testTime), date.Value.Value()) + + timeOnly, err := obj.ValueByKey("TimeOnly") + require.NoError(t, err) + assert.Equal(t, variant.Time, timeOnly.Value.Type()) + assert.Equal(t, arrow.Time64(52200123456), timeOnly.Value.Value()) + }) + + t.Run("nested_struct", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(NestedStruct{ + ID: 123, + Metadata: &SimpleStruct{ + ID: 456, + Name: "nested", + IsValid: true, + }, + Tags: []string{"tag1", "tag2"}, + })) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Object, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{ + "id": 123, + "meta": {"ID": 456, "Name": "nested", "IsValid": true}, + "tags": ["tag1", "tag2"] + }`, string(out)) + }) + + t.Run("nil_struct_pointer", func(t *testing.T) { + var b variant.Builder + require.NoError(t, b.Append(NestedStruct{ + ID: 123, + Metadata: nil, + Tags: []string{"tag1"}, + })) + + v, err := b.Build() + require.NoError(t, err) + assert.Equal(t, variant.Object, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{"id": 123, "meta": null, "tags": ["tag1"]}`, string(out)) + }) +} + +func TestAppendReset(t *testing.T) { + var b variant.Builder + + // First build + require.NoError(t, b.Append(map[string]any{"key": "value"})) + v1, err := b.Build() + require.NoError(t, err) + + out1, err := json.Marshal(v1) + require.NoError(t, err) + assert.JSONEq(t, `{"key": "value"}`, string(out1)) + v1 = v1.Clone() + + // Reset and build again + b.Reset() + require.NoError(t, b.Append([]int{1, 2, 3})) + v2, err := b.Build() + require.NoError(t, err) + + // First value should still be valid because we cloned it + // before calling Reset + assert.Equal(t, variant.Object, v1.Type()) + out1, err = json.Marshal(v1) + require.NoError(t, err) + 
assert.JSONEq(t, `{"key": "value"}`, string(out1)) + + // Second value should be different + assert.Equal(t, variant.Array, v2.Type()) + out2, err := json.Marshal(v2) + require.NoError(t, err) + assert.JSONEq(t, `[1, 2, 3]`, string(out2)) + + // Without cloning, the first value would be invalidated + v1Clone := v1.Clone() + b.Reset() + + out3, err := json.Marshal(v1Clone) + require.NoError(t, err) + assert.JSONEq(t, `{"key": "value"}`, string(out3)) +} diff --git a/parquet/variant/doc.go b/parquet/variant/doc.go new file mode 100644 index 00000000..29c6b777 --- /dev/null +++ b/parquet/variant/doc.go @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package variant provides an implementation of the Apache Parquet Variant data type. +// +// The Variant type is a flexible binary format designed to represent complex nested +// data structures with minimal overhead. It supports a wide range of primitive types +// as well as nested arrays and objects (similar to JSON). The format uses a memory-efficient +// binary representation with a separate metadata section for dictionary encoding of keys. 
+// +// # Key Components +// +// - [Value]: The primary type representing a variant value +// - [Metadata]: Contains information about the dictionary of keys +// - [Builder]: Used to construct variant values +// +// # Format Overview +// +// The variant format consists of two parts: +// +// 1. Metadata: A dictionary of keys used in objects +// 2. Value: The actual data payload +// +// Values can be one of the following types: +// +// - Primitive values (null, bool, int8/16/32/64, float32/64, etc.) +// - Short strings (less than 64 bytes) +// - Long strings and binary data +// - Date, time and timestamp values +// - Decimal values (4, 8, or 16 bytes) +// - Arrays of any variant value +// - Objects with key-value pairs +// +// # Working with Variants +// +// To create a variant value, use the Builder: +// +// var b variant.Builder +// b.Append(map[string]any{ +// "id": 123, +// "name": "example", +// "data": []any{1, 2, 3}, +// }) +// value, err := b.Build() +// +// To parse an existing variant value: +// +// v, err := variant.New(metadataBytes, valueBytes) +// +// You can access the data using the [Value.Value] method which returns the appropriate Go type: +// +// switch v.Type() { +// case variant.Object: +// obj := v.Value().(variant.ObjectValue) +// field, err := obj.ValueByKey("name") +// case variant.Array: +// arr := v.Value().(variant.ArrayValue) +// elem, err := arr.Value(0) +// case variant.String: +// s := v.Value().(string) +// case variant.Int64: +// i := v.Value().(int64) +// } +// +// You can also switch on the type of the result value from the [Value.Value] method: +// +// switch val := v.Value().(type) { +// case nil: +// // ... +// case int32: +// // ... +// case string: +// // ... 
+//	case variant.ArrayValue:
+//		for i, item := range val.Values() {
+//			// item is a variant.Value
+//		}
+//	case variant.ObjectValue:
+//		for k, item := range val.Values() {
+//			// k is the field key
+//			// item is a variant.Value for that field
+//		}
+//	}
+//
+// Values can also be converted to JSON:
+//
+//	jsonBytes, err := json.Marshal(v)
+//
+// # Low-level Construction
+//
+// For direct construction of complex nested structures, you can use the low-level
+// methods:
+//
+//	var b variant.Builder
+//	// Start an object
+//	start := b.Offset()
+//	fields := make([]variant.FieldEntry, 0)
+//
+//	// Add a field
+//	fields = append(fields, b.NextField(start, "key"))
+//	b.AppendString("value")
+//
+//	// Finish the object
+//	b.FinishObject(start, fields)
+//
+//	value, err := b.Build()
+//
+// # Using Struct Tags
+//
+// When appending Go structs, you can use struct tags to control field names and
+// encoding options:
+//
+//	type Person struct {
+//		ID        int       `variant:"id"`
+//		Name      string    `variant:"name"`
+//		CreatedAt time.Time `variant:"timestamp,nanos,utc"`
+//		Internal  string    `variant:"-"` // Ignored field
+//	}
+//
+// # Reusing Builders
+//
+// When reusing a Builder for multiple values, use Reset() to clear it:
+//
+//	var b variant.Builder
+//	b.Append(data1); v1, _ := b.Build()
+//	v1 = v1.Clone() // Clone before reset if you need to keep the value
+//	b.Reset()
+//	b.Append(data2); v2, _ := b.Build()
+package variant
diff --git a/parquet/variant/primitive_type_stringer.go b/parquet/variant/primitive_type_stringer.go
new file mode 100644
index 00000000..205724c3
--- /dev/null
+++ b/parquet/variant/primitive_type_stringer.go
@@ -0,0 +1,45 @@
+// Code generated by "stringer -type=PrimitiveType -linecomment -output=primitive_type_stringer.go"; DO NOT EDIT.
+
+package variant
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+ var x [1]struct{} + _ = x[PrimitiveInvalid - -1] + _ = x[PrimitiveNull-0] + _ = x[PrimitiveBoolTrue-1] + _ = x[PrimitiveBoolFalse-2] + _ = x[PrimitiveInt8-3] + _ = x[PrimitiveInt16-4] + _ = x[PrimitiveInt32-5] + _ = x[PrimitiveInt64-6] + _ = x[PrimitiveDouble-7] + _ = x[PrimitiveDecimal4-8] + _ = x[PrimitiveDecimal8-9] + _ = x[PrimitiveDecimal16-10] + _ = x[PrimitiveDate-11] + _ = x[PrimitiveTimestampMicros-12] + _ = x[PrimitiveTimestampMicrosNTZ-13] + _ = x[PrimitiveFloat-14] + _ = x[PrimitiveBinary-15] + _ = x[PrimitiveString-16] + _ = x[PrimitiveTimeMicrosNTZ-17] + _ = x[PrimitiveTimestampNanos-18] + _ = x[PrimitiveTimestampNanosNTZ-19] + _ = x[PrimitiveUUID-20] +} + +const _PrimitiveType_name = "UnknownNullBoolTrueBoolFalseInt8Int16Int32Int64DoubleDecimal32Decimal64Decimal128DateTimestamp(micros)TimestampNTZ(micros)FloatBinaryStringTimeNTZ(micros)Timestamp(nanos)TimestampNTZ(nanos)UUID" + +var _PrimitiveType_index = [...]uint8{0, 7, 11, 19, 28, 32, 37, 42, 47, 53, 62, 71, 81, 85, 102, 122, 127, 133, 139, 154, 170, 189, 193} + +func (i PrimitiveType) String() string { + i -= -1 + if i < 0 || i >= PrimitiveType(len(_PrimitiveType_index)-1) { + return "PrimitiveType(" + strconv.FormatInt(int64(i+-1), 10) + ")" + } + return _PrimitiveType_name[_PrimitiveType_index[i]:_PrimitiveType_index[i+1]] +} diff --git a/parquet/variant/utils.go b/parquet/variant/utils.go new file mode 100644 index 00000000..9b8ca24d --- /dev/null +++ b/parquet/variant/utils.go @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package variant + +import ( + "encoding/binary" + "math" + "unsafe" + + "github.com/apache/arrow-go/v18/arrow/endian" + "github.com/apache/arrow-go/v18/parquet/internal/debug" +) + +func readLEU32(b []byte) uint32 { + debug.Assert(len(b) <= 4, "buffer too large") + debug.Assert(len(b) >= 1, "buffer too small") + + var result uint32 + v := (*[4]byte)(unsafe.Pointer(&result)) + copy(v[:], b) + + return endian.FromLE(result) +} + +func readLEU64(b []byte) uint64 { + debug.Assert(len(b) <= 8, "buffer too large") + debug.Assert(len(b) >= 1, "buffer too small") + + var result uint64 + v := (*[8]byte)(unsafe.Pointer(&result)) + copy(v[:], b) + + return endian.FromLE(result) +} + +func readExact[T int8 | int16 | int32 | int64 | float32 | float64](b []byte) T { + debug.Assert(len(b) >= binary.Size(T(0)), "buffer size mismatch") + var result T + binary.Decode(b, binary.LittleEndian, &result) + return result +} + +func primitiveHeader(t PrimitiveType) byte { + return (byte(t)<<2 | byte(BasicPrimitive)) +} + +func shortStrHeader(sz int) byte { + return byte(sz<<2) | byte(BasicShortString) +} + +func arrayHeader(large bool, offsetSize uint8) byte { + var largeBit byte + if large { + largeBit = 1 + } + + return (largeBit << (basicTypeBits + 2)) | + ((offsetSize - 1) << basicTypeBits) | byte(BasicArray) +} + +func objectHeader(large bool, idSize, offsetSize uint8) byte { + var largeBit byte + if large { + largeBit = 1 + } + + return (largeBit << (basicTypeBits + 4)) | + ((idSize - 1) << (basicTypeBits + 2)) | + ((offsetSize - 1) << basicTypeBits) | 
byte(BasicObject) +} + +func intSize(v int) uint8 { + debug.Assert(v <= metadataMaxSizeLimit, "size too large") + debug.Assert(v >= 0, "size cannot be negative") + + switch { + case v <= math.MaxUint8: + return 1 + case v <= math.MaxUint16: + return 2 + case v <= 0xFFFFFF: // MaxUint24 + return 3 + default: + return 4 + } +} + +func writeOffset(buf []byte, v int, nbytes uint8) { + debug.Assert(nbytes <= 4, "nbytes size too large") + debug.Assert(nbytes >= 1, "nbytes size too small") + + for i := range nbytes { + buf[i] = byte((v >> (i * 8)) & 0xFF) + } +} + +func valueSize(v []byte) int { + basicType, typeInfo := v[0]&basicTypeMask, (v[0]>>basicTypeBits)&typeInfoMask + switch basicType { + case byte(BasicShortString): + return 1 + int(typeInfo) + case byte(BasicObject): + var szBytes uint8 = 1 + if ((typeInfo >> 4) & 0x1) != 0 { + szBytes = 4 + } + + sz := readLEU32(v[1 : 1+szBytes]) + idSize, offsetSize := ((typeInfo>>2)&0b11)+1, uint32((typeInfo&0b11)+1) + idStart := 1 + szBytes + offsetStart := uint32(idStart) + sz*uint32(idSize) + dataStart := offsetStart + (sz+1)*offsetSize + + idx := offsetStart + sz*uint32(offsetSize) + return int(dataStart + readLEU32(v[idx:idx+offsetSize])) + case byte(BasicArray): + var szBytes uint8 = 1 + if ((typeInfo >> 4) & 0x1) != 0 { + szBytes = 4 + } + + sz := readLEU32(v[1 : 1+szBytes]) + offsetSize, offsetStart := uint32((typeInfo&0b11)+1), uint32(1+szBytes) + dataStart := offsetStart + (sz+1)*offsetSize + + idx := offsetStart + sz*uint32(offsetSize) + return int(dataStart + readLEU32(v[idx:idx+offsetSize])) + default: + switch PrimitiveType(typeInfo) { + case PrimitiveNull, PrimitiveBoolTrue, PrimitiveBoolFalse: + return 1 + case PrimitiveInt8: + return 2 + case PrimitiveInt16: + return 3 + case PrimitiveInt32, PrimitiveDate, PrimitiveFloat, PrimitiveTimeMicrosNTZ: + return 5 + case PrimitiveInt64, PrimitiveDouble, + PrimitiveTimestampMicros, PrimitiveTimestampMicrosNTZ, + PrimitiveTimestampNanos, PrimitiveTimestampNanosNTZ: + 
return 9 + case PrimitiveDecimal4: + return 6 + case PrimitiveDecimal8: + return 10 + case PrimitiveDecimal16: + return 18 + case PrimitiveBinary, PrimitiveString: + return 5 + int(readLEU32(v[1:5])) + case PrimitiveUUID: + return 17 + default: + panic("unknown primitive type") + } + } +} diff --git a/parquet/variant/variant.go b/parquet/variant/variant.go new file mode 100644 index 00000000..9ba05787 --- /dev/null +++ b/parquet/variant/variant.go @@ -0,0 +1,722 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package variant + +import ( + "bytes" + "encoding/binary" + "encoding/json" + "errors" + "fmt" + "iter" + "maps" + "slices" + "strings" + "time" + "unsafe" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/decimal" + "github.com/apache/arrow-go/v18/arrow/decimal128" + "github.com/apache/arrow-go/v18/parquet/internal/debug" + "github.com/google/uuid" +) + +//go:generate go tool stringer -type=BasicType -linecomment -output=basic_type_stringer.go +//go:generate go tool stringer -type=PrimitiveType -linecomment -output=primitive_type_stringer.go + +// BasicType represents the fundamental type category of a variant value. 
+type BasicType int + +const ( + BasicUndefined BasicType = iota - 1 // Unknown + BasicPrimitive // Primitive + BasicShortString // ShortString + BasicObject // Object + BasicArray // Array +) + +func basicTypeFromHeader(hdr byte) BasicType { + // because we're doing hdr & 0x3, it is impossible for the result + // to be outside of the range of BasicType. Therefore, we don't + // need to perform any checks. The value will always be [0,3] + return BasicType(hdr & basicTypeMask) +} + +// PrimitiveType represents specific primitive data types within the variant format. +type PrimitiveType int + +const ( + PrimitiveInvalid PrimitiveType = iota - 1 // Unknown + PrimitiveNull // Null + PrimitiveBoolTrue // BoolTrue + PrimitiveBoolFalse // BoolFalse + PrimitiveInt8 // Int8 + PrimitiveInt16 // Int16 + PrimitiveInt32 // Int32 + PrimitiveInt64 // Int64 + PrimitiveDouble // Double + PrimitiveDecimal4 // Decimal32 + PrimitiveDecimal8 // Decimal64 + PrimitiveDecimal16 // Decimal128 + PrimitiveDate // Date + PrimitiveTimestampMicros // Timestamp(micros) + PrimitiveTimestampMicrosNTZ // TimestampNTZ(micros) + PrimitiveFloat // Float + PrimitiveBinary // Binary + PrimitiveString // String + PrimitiveTimeMicrosNTZ // TimeNTZ(micros) + PrimitiveTimestampNanos // Timestamp(nanos) + PrimitiveTimestampNanosNTZ // TimestampNTZ(nanos) + PrimitiveUUID // UUID +) + +func primitiveTypeFromHeader(hdr byte) PrimitiveType { + return PrimitiveType((hdr >> basicTypeBits) & typeInfoMask) +} + +// Type represents the high-level variant data type. +// This is what applications typically use to identify the type of a variant value. 
+type Type int + +const ( + Object Type = iota + Array + Null + Bool + Int8 + Int16 + Int32 + Int64 + String + Double + Decimal4 + Decimal8 + Decimal16 + Date + TimestampMicros + TimestampMicrosNTZ + Float + Binary + Time + TimestampNanos + TimestampNanosNTZ + UUID +) + +const ( + versionMask uint8 = 0x0F + sortedStrMask uint8 = 0b10000 + basicTypeMask uint8 = 0x3 + basicTypeBits uint8 = 2 + typeInfoMask uint8 = 0x3F + hdrSizeBytes = 1 + minOffsetSizeBytes = 1 + maxOffsetSizeBytes = 4 + + // mask is applied after shift + offsetSizeMask uint8 = 0b11 + offsetSizeBitShift uint8 = 6 + supportedVersion = 1 + maxShortStringSize = 0x3F + metadataMaxSizeLimit = 128 * 1024 * 1024 // 128MB +) + +var ( + // EmptyMetadataBytes contains a minimal valid metadata section with no dictionary entries. + EmptyMetadataBytes = [3]byte{0x1, 0, 0} + + ErrInvalidMetadata = errors.New("invalid variant metadata") +) + +// Metadata represents the dictionary part of a variant value, which stores +// the keys used in object values. +type Metadata struct { + data []byte + keys [][]byte +} + +// NewMetadata creates a Metadata instance from a raw byte slice. +// It validates the metadata format and loads the key dictionary. +func NewMetadata(data []byte) (Metadata, error) { + m := Metadata{data: data} + if len(data) < hdrSizeBytes+minOffsetSizeBytes*2 { + return m, fmt.Errorf("%w: too short: size=%d", ErrInvalidMetadata, len(data)) + } + + if m.Version() != supportedVersion { + return m, fmt.Errorf("%w: unsupported version: %d", ErrInvalidMetadata, m.Version()) + } + + offsetSz := m.OffsetSize() + return m, m.loadDictionary(offsetSz) +} + +// Clone creates a deep copy of the metadata. 
+func (m *Metadata) Clone() Metadata { + return Metadata{ + data: bytes.Clone(m.data), + // shallow copy of the values, but the slice is copied + // more efficient, and nothing should be mutating the keys + // so it's probably safe, but something we should keep in mind + keys: slices.Clone(m.keys), + } +} + +func (m *Metadata) loadDictionary(offsetSz uint8) error { + if int(offsetSz+hdrSizeBytes) > len(m.data) { + return fmt.Errorf("%w: too short for dictionary size", ErrInvalidMetadata) + } + + dictSize := readLEU32(m.data[hdrSizeBytes : hdrSizeBytes+offsetSz]) + m.keys = make([][]byte, dictSize) + + if dictSize == 0 { + return nil + } + + // first offset is always 0 + offsetStart, offsetPos := uint32(0), hdrSizeBytes+offsetSz + valuesStart := hdrSizeBytes + (dictSize+2)*uint32(offsetSz) + if hdrSizeBytes+int(dictSize+1)*int(offsetSz) > len(m.data) { + return fmt.Errorf("%w: offset out of range: %d > %d", + ErrInvalidMetadata, (dictSize+hdrSizeBytes)*uint32(offsetSz), len(m.data)) + } + + for i := range dictSize { + offsetPos += offsetSz + end := readLEU32(m.data[offsetPos : offsetPos+offsetSz]) + + keySize := end - offsetStart + valStart := valuesStart + offsetStart + if valStart+keySize > uint32(len(m.data)) { + return fmt.Errorf("%w: string data out of range: %d + %d > %d", + ErrInvalidMetadata, valStart, keySize, len(m.data)) + } + m.keys[i] = m.data[valStart : valStart+keySize] + offsetStart += keySize + } + + return nil +} + +// Bytes returns the raw byte representation of the metadata. +func (m Metadata) Bytes() []byte { return m.data } + +// Version returns the metadata format version. +func (m Metadata) Version() uint8 { return m.data[0] & versionMask } + +// SortedAndUnique returns whether the keys in the metadata dictionary are sorted and unique. +func (m Metadata) SortedAndUnique() bool { return m.data[0]&sortedStrMask != 0 } + +// OffsetSize returns the size in bytes used to store offsets in the metadata. 
+func (m Metadata) OffsetSize() uint8 { + return ((m.data[0] >> offsetSizeBitShift) & offsetSizeMask) + 1 +} + +// DictionarySize returns the number of keys in the metadata dictionary. +func (m Metadata) DictionarySize() uint32 { return uint32(len(m.keys)) } + +// KeyAt returns the string key at the given dictionary ID. +// Returns an error if the ID is out of range. +func (m Metadata) KeyAt(id uint32) (string, error) { + if id >= uint32(len(m.keys)) { + return "", fmt.Errorf("invalid variant metadata: id out of range: %d >= %d", + id, len(m.keys)) + } + + return unsafe.String(&m.keys[id][0], len(m.keys[id])), nil +} + +// IdFor returns the dictionary IDs for the given key. +// If the metadata is sorted and unique, this performs a binary search. +// Otherwise, it performs a linear search. +// +// If the metadata is not sorted and unique, then it's possible that multiple +// IDs will be returned for the same key. +func (m Metadata) IdFor(key string) []uint32 { + k := unsafe.Slice(unsafe.StringData(key), len(key)) + + var ret []uint32 + if m.SortedAndUnique() { + idx, found := slices.BinarySearchFunc(m.keys, k, bytes.Compare) + if found { + ret = append(ret, uint32(idx)) + } + + return ret + } + + for i, kb := range m.keys { + if bytes.Equal(kb, k) { + ret = append(ret, uint32(i)) + } + } + + return ret +} + +// DecimalValue represents a decimal number with a specified scale. +// The generic parameter T can be any supported variant decimal type (Decimal32, Decimal64, Decimal128). +type DecimalValue[T decimal.DecimalTypes] struct { + Scale uint8 + Value decimal.Num[T] +} + +// MarshalJSON implements the json.Marshaler interface for DecimalValue. +func (v DecimalValue[T]) MarshalJSON() ([]byte, error) { + return []byte(v.Value.ToString(int32(v.Scale))), nil +} + +// ArrayValue represents an array of variant values. 
+type ArrayValue struct { + value []byte + meta Metadata + + numElements uint32 + dataStart uint32 + offsetSize uint8 + offsetStart uint8 +} + +// MarshalJSON implements the json.Marshaler interface for ArrayValue. +func (v ArrayValue) MarshalJSON() ([]byte, error) { + return json.Marshal(slices.Collect(v.Values())) +} + +// Len returns the number of elements in the array. +func (v ArrayValue) Len() uint32 { return v.numElements } + +// Values returns an iterator for the elements in the array, allowing +// for lazy evaluation of the offsets (for the situation where not all elements +// are iterated). +func (v ArrayValue) Values() iter.Seq[Value] { + return func(yield func(Value) bool) { + for i := range v.numElements { + idx := uint32(v.offsetStart) + i*uint32(v.offsetSize) + offset := readLEU32(v.value[idx : idx+uint32(v.offsetSize)]) + if !yield(Value{value: v.value[v.dataStart+offset:], meta: v.meta}) { + return + } + } + } +} + +// Value returns the Value at the specified index. +// Returns an error if the index is out of range. +func (v ArrayValue) Value(i uint32) (Value, error) { + if i >= v.numElements { + return Value{}, fmt.Errorf("%w: invalid array value: index out of range: %d >= %d", + arrow.ErrIndex, i, v.numElements) + } + + idx := uint32(v.offsetStart) + i*uint32(v.offsetSize) + offset := readLEU32(v.value[idx : idx+uint32(v.offsetSize)]) + + return Value{meta: v.meta, value: v.value[v.dataStart+offset:]}, nil +} + +// ObjectValue represents an object (map/dictionary) of key-value pairs. +type ObjectValue struct { + value []byte + meta Metadata + + numElements uint32 + offsetStart uint32 + dataStart uint32 + idSize uint8 + offsetSize uint8 + idStart uint8 +} + +// ObjectField represents a key-value pair in an object. +type ObjectField struct { + Key string + Value Value +} + +// NumElements returns the number of fields in the object. 
+func (v ObjectValue) NumElements() uint32 { return v.numElements }
+
+// ValueByKey returns the field with the specified key.
+// Returns arrow.ErrNotFound if the key doesn't exist.
+func (v ObjectValue) ValueByKey(key string) (ObjectField, error) {
+	n := v.numElements
+
+	// if total list size is smaller than threshold, linear search will
+	// likely be faster than a binary search
+	const binarySearchThreshold = 32
+	if n < binarySearchThreshold {
+		for i := range n {
+			idx := uint32(v.idStart) + i*uint32(v.idSize)
+			id := readLEU32(v.value[idx : idx+uint32(v.idSize)])
+			k, err := v.meta.KeyAt(id)
+			if err != nil {
+				return ObjectField{}, fmt.Errorf("invalid object value: fieldID at idx %d is not in metadata", idx)
+			}
+			if k == key {
+				idx := uint32(v.offsetStart) + uint32(v.offsetSize)*i
+				offset := readLEU32(v.value[idx : idx+uint32(v.offsetSize)])
+				return ObjectField{
+					Key:   key,
+					Value: Value{value: v.value[v.dataStart+offset:], meta: v.meta}}, nil
+			}
+		}
+		return ObjectField{}, arrow.ErrNotFound
+	}
+
+	i, j := uint32(0), n
+	for i < j {
+		mid := (i + j) >> 1
+		idx := uint32(v.idStart) + mid*uint32(v.idSize)
+		id := readLEU32(v.value[idx : idx+uint32(v.idSize)])
+		k, err := v.meta.KeyAt(id)
+		if err != nil {
+			return ObjectField{}, fmt.Errorf("invalid object value: fieldID at idx %d is not in metadata", idx)
+		}
+
+		switch strings.Compare(k, key) {
+		case -1:
+			i = mid + 1
+		case 0:
+			idx := uint32(v.offsetStart) + uint32(v.offsetSize)*mid
+			offset := readLEU32(v.value[idx : idx+uint32(v.offsetSize)])
+
+			return ObjectField{
+				Key:   key,
+				Value: Value{value: v.value[v.dataStart+offset:], meta: v.meta}}, nil
+		case 1:
+			// half-open interval [i, j): key sorts before keys[mid], so the
+			// candidate range becomes [i, mid). Using mid-1 here would skip
+			// index mid-1 and, when mid == 0, wrap the uint32 around.
+			j = mid
+		}
+	}
+
+	return ObjectField{}, arrow.ErrNotFound
+}
+
+// FieldAt returns the field at the specified index.
+// Returns an error if the index is out of range.
+func (v ObjectValue) FieldAt(i uint32) (ObjectField, error) { + if i >= v.numElements { + return ObjectField{}, fmt.Errorf("%w: invalid object value: index out of range: %d >= %d", + arrow.ErrIndex, i, v.numElements) + } + + idx := uint32(v.idStart) + i*uint32(v.idSize) + id := readLEU32(v.value[idx : idx+uint32(v.idSize)]) + k, err := v.meta.KeyAt(id) + if err != nil { + return ObjectField{}, fmt.Errorf("invalid object value: fieldID at idx %d is not in metadata", idx) + } + + offsetIdx := uint32(v.offsetStart) + i*uint32(v.offsetSize) + offset := readLEU32(v.value[offsetIdx : offsetIdx+uint32(v.offsetSize)]) + + return ObjectField{ + Key: k, + Value: Value{value: v.value[v.dataStart+offset:], meta: v.meta}}, nil +} + +// Values returns an iterator over all key-value pairs in the object. +func (v ObjectValue) Values() iter.Seq2[string, Value] { + return func(yield func(string, Value) bool) { + for i := range v.numElements { + idx := uint32(v.idStart) + i*uint32(v.idSize) + id := readLEU32(v.value[idx : idx+uint32(v.idSize)]) + k, err := v.meta.KeyAt(id) + if err != nil { + return + } + + offsetIdx := uint32(v.offsetStart) + i*uint32(v.offsetSize) + offset := readLEU32(v.value[offsetIdx : offsetIdx+uint32(v.offsetSize)]) + + if !yield(k, Value{value: v.value[v.dataStart+offset:], meta: v.meta}) { + return + } + } + } +} + +// MarshalJSON implements the json.Marshaler interface for ObjectValue. +func (v ObjectValue) MarshalJSON() ([]byte, error) { + // for now we'll use a naive approach and just build a map + // then marshal it. This is not the most efficient way to do this + // but it is the simplest and most straightforward. + mapping := make(map[string]Value) + maps.Insert(mapping, v.Values()) + return json.Marshal(mapping) +} + +// Value represents a variant value of any type. +type Value struct { + value []byte + meta Metadata +} + +// NewWithMetadata creates a Value with the provided metadata and value bytes. 
+func NewWithMetadata(meta Metadata, value []byte) (Value, error) { + if len(value) == 0 { + return Value{}, errors.New("invalid variant value: empty") + } + + return Value{value: value, meta: meta}, nil +} + +// New creates a Value by parsing both the metadata and value bytes. +func New(meta, value []byte) (Value, error) { + m, err := NewMetadata(meta) + if err != nil { + return Value{}, err + } + + return NewWithMetadata(m, value) +} + +// Bytes returns the raw byte representation of the value (excluding metadata). +func (v Value) Bytes() []byte { return v.value } + +// Clone creates a deep copy of the value including its metadata. +func (v Value) Clone() Value { + return Value{ + meta: v.meta.Clone(), + value: bytes.Clone(v.value), + } +} + +// Metadata returns the metadata associated with the value. +func (v Value) Metadata() Metadata { return v.meta } + +// BasicType returns the fundamental type category of the value. +func (v Value) BasicType() BasicType { + return basicTypeFromHeader(v.value[0]) +} + +// Type returns the specific data type of the value. 
+func (v Value) Type() Type { + switch t := v.BasicType(); t { + case BasicPrimitive: + switch primType := primitiveTypeFromHeader(v.value[0]); primType { + case PrimitiveNull: + return Null + case PrimitiveBoolTrue, PrimitiveBoolFalse: + return Bool + case PrimitiveInt8: + return Int8 + case PrimitiveInt16: + return Int16 + case PrimitiveInt32: + return Int32 + case PrimitiveInt64: + return Int64 + case PrimitiveDouble: + return Double + case PrimitiveDecimal4: + return Decimal4 + case PrimitiveDecimal8: + return Decimal8 + case PrimitiveDecimal16: + return Decimal16 + case PrimitiveDate: + return Date + case PrimitiveTimestampMicros: + return TimestampMicros + case PrimitiveTimestampMicrosNTZ: + return TimestampMicrosNTZ + case PrimitiveFloat: + return Float + case PrimitiveBinary: + return Binary + case PrimitiveString: + return String + case PrimitiveTimeMicrosNTZ: + return Time + case PrimitiveTimestampNanos: + return TimestampNanos + case PrimitiveTimestampNanosNTZ: + return TimestampNanosNTZ + case PrimitiveUUID: + return UUID + default: + panic(fmt.Errorf("invalid primitive type found: %d", primType)) + } + case BasicShortString: + return String + case BasicObject: + return Object + case BasicArray: + return Array + default: + panic(fmt.Errorf("invalid basic type found: %d", t)) + } +} + +// Value returns the Go value representation of the variant. 
+// The returned type depends on the variant type: +// - Null: nil +// - Bool: bool +// - Int8/16/32/64: corresponding int type +// - Float/Double: float32/float64 +// - String: string +// - Binary: []byte +// - Decimal: DecimalValue +// - Date: arrow.Date32 +// - Time: arrow.Time64 +// - Timestamp: arrow.Timestamp +// - UUID: uuid.UUID +// - Object: ObjectValue +// - Array: ArrayValue +func (v Value) Value() any { + switch t := v.BasicType(); t { + case BasicPrimitive: + switch primType := primitiveTypeFromHeader(v.value[0]); primType { + case PrimitiveNull: + return nil + case PrimitiveBoolTrue: + return true + case PrimitiveBoolFalse: + return false + case PrimitiveInt8: + return readExact[int8](v.value[1:]) + case PrimitiveInt16: + return readExact[int16](v.value[1:]) + case PrimitiveInt32: + return readExact[int32](v.value[1:]) + case PrimitiveInt64: + return readExact[int64](v.value[1:]) + case PrimitiveDouble: + return readExact[float64](v.value[1:]) + case PrimitiveFloat: + return readExact[float32](v.value[1:]) + case PrimitiveDate: + return arrow.Date32(readExact[int32](v.value[1:])) + case PrimitiveTimestampMicros, PrimitiveTimestampMicrosNTZ, + PrimitiveTimestampNanos, PrimitiveTimestampNanosNTZ: + return arrow.Timestamp(readExact[int64](v.value[1:])) + case PrimitiveTimeMicrosNTZ: + return arrow.Time64(readExact[int64](v.value[1:])) + case PrimitiveUUID: + debug.Assert(len(v.value[1:]) == 16, "invalid UUID length") + return uuid.Must(uuid.FromBytes(v.value[1:])) + case PrimitiveBinary: + sz := binary.LittleEndian.Uint32(v.value[1:5]) + return v.value[5 : 5+sz] + case PrimitiveString: + sz := binary.LittleEndian.Uint32(v.value[1:5]) + return unsafe.String(&v.value[5], sz) + case PrimitiveDecimal4: + scale := uint8(v.value[1]) + val := decimal.Decimal32(readExact[int32](v.value[2:])) + return DecimalValue[decimal.Decimal32]{Scale: scale, Value: val} + case PrimitiveDecimal8: + scale := uint8(v.value[1]) + val := 
decimal.Decimal64(readExact[int64](v.value[2:])) + return DecimalValue[decimal.Decimal64]{Scale: scale, Value: val} + case PrimitiveDecimal16: + scale := uint8(v.value[1]) + lowBits := readLEU64(v.value[2:10]) + highBits := readExact[int64](v.value[10:]) + return DecimalValue[decimal.Decimal128]{ + Scale: scale, + Value: decimal128.New(highBits, lowBits), + } + } + case BasicShortString: + sz := int(v.value[0] >> 2) + return unsafe.String(&v.value[1], sz) + case BasicObject: + valueHdr := (v.value[0] >> basicTypeBits) + fieldOffsetSz := (valueHdr & 0b11) + 1 + fieldIdSz := ((valueHdr >> 2) & 0b11) + 1 + isLarge := ((valueHdr >> 4) & 0b1) == 1 + + var nelemSize uint8 = 1 + if isLarge { + nelemSize = 4 + } + + debug.Assert(len(v.value) >= int(1+nelemSize), "invalid object value: too short") + numElements := readLEU32(v.value[1 : 1+nelemSize]) + idStart := uint32(1 + nelemSize) + offsetStart := idStart + numElements*uint32(fieldIdSz) + dataStart := offsetStart + (numElements+1)*uint32(fieldOffsetSz) + + debug.Assert(dataStart <= uint32(len(v.value)), "invalid object value: dataStart out of range") + return ObjectValue{ + value: v.value, + meta: v.meta, + numElements: numElements, + offsetStart: offsetStart, + dataStart: dataStart, + idSize: fieldIdSz, + offsetSize: fieldOffsetSz, + idStart: uint8(idStart), + } + case BasicArray: + valueHdr := (v.value[0] >> basicTypeBits) + fieldOffsetSz := (valueHdr & 0b11) + 1 + isLarge := (valueHdr & 0b1) == 1 + + var ( + sz int + offsetStart, dataStart int + ) + + if isLarge { + sz, offsetStart = int(readLEU32(v.value[1:5])), 5 + } else { + sz, offsetStart = int(v.value[1]), 2 + } + + dataStart = offsetStart + (sz+1)*int(fieldOffsetSz) + debug.Assert(dataStart <= len(v.value), "invalid array value: dataStart out of range") + return ArrayValue{ + value: v.value, + meta: v.meta, + numElements: uint32(sz), + dataStart: uint32(dataStart), + offsetSize: fieldOffsetSz, + offsetStart: uint8(offsetStart), + } + } + + debug.Assert(false, 
"unsupported type")
+	return nil
+}
+
+// MarshalJSON implements the json.Marshaler interface for Value.
+func (v Value) MarshalJSON() ([]byte, error) {
+	result := v.Value()
+	switch t := result.(type) {
+	case arrow.Date32:
+		result = t.FormattedString()
+	case arrow.Timestamp:
+		switch primType := primitiveTypeFromHeader(v.value[0]); primType {
+		case PrimitiveTimestampMicros:
+			result = t.ToTime(arrow.Microsecond).Format("2006-01-02 15:04:05.999999Z0700")
+		case PrimitiveTimestampMicrosNTZ:
+			result = t.ToTime(arrow.Microsecond).In(time.Local).Format("2006-01-02 15:04:05.999999Z0700")
+		case PrimitiveTimestampNanos:
+			result = t.ToTime(arrow.Nanosecond).Format("2006-01-02 15:04:05.999999999Z0700")
+		case PrimitiveTimestampNanosNTZ:
+			result = t.ToTime(arrow.Nanosecond).In(time.Local).Format("2006-01-02 15:04:05.999999999Z0700")
+		}
+	case arrow.Time64:
+		// Value() produces arrow.Time64 for PrimitiveTimeMicrosNTZ; matching
+		// on arrow.Time32 here would never fire and the time would be
+		// serialized as a raw integer.
+		result = t.ToTime(arrow.Microsecond).In(time.Local).Format("15:04:05.999999Z0700")
+	}
+
+	return json.Marshal(result)
+}
diff --git a/parquet/variant/variant_test.go b/parquet/variant/variant_test.go
new file mode 100644
index 00000000..2ef4da38
--- /dev/null
+++ b/parquet/variant/variant_test.go
@@ -0,0 +1,787 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package variant_test + +import ( + "encoding/json" + "math" + "os" + "path/filepath" + "testing" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/decimal" + "github.com/apache/arrow-go/v18/arrow/decimal128" + "github.com/apache/arrow-go/v18/parquet/variant" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func getVariantDir() string { + variantDir := os.Getenv("PARQUET_TEST_DATA") + if variantDir == "" { + return "" + } + + return filepath.Join(variantDir, "..", "variant") +} + +func metadataTestFilename(test string) string { + return test + ".metadata" +} + +func valueTestFilename(test string) string { + return test + ".value" +} + +func TestBasicRead(t *testing.T) { + dir := getVariantDir() + if dir == "" { + t.Skip("PARQUET_TEST_DATA not set") + } + + tests := []string{ + // FIXME: null metadata is corrupt, see + // https://github.com/apache/parquet-testing/issues/81 + // "primitive_null.metadata", + "primitive_boolean_true.metadata", + "primitive_boolean_false.metadata", + "primitive_int8.metadata", + "primitive_int16.metadata", + "primitive_int32.metadata", + "primitive_int64.metadata", + "primitive_float.metadata", + "primitive_double.metadata", + "primitive_string.metadata", + "primitive_binary.metadata", + "primitive_date.metadata", + "primitive_decimal4.metadata", + "primitive_decimal8.metadata", + "primitive_decimal16.metadata", + "primitive_timestamp.metadata", + "primitive_timestampntz.metadata", + } + + for _, test := range tests { + t.Run(test, func(t *testing.T) { + fname := filepath.Join(dir, test) + require.FileExists(t, fname, "file %s does not exist", fname) + + metadata, err := os.ReadFile(fname) + require.NoError(t, err) + + m, err := variant.NewMetadata(metadata) + require.NoError(t, err) + assert.EqualValues(t, 1, m.Version()) + _, err = m.KeyAt(0) + assert.Error(t, err) + }) + } + + t.Run("object_primitive.metadata", func(t 
*testing.T) { + fname := filepath.Join(dir, "object_primitive.metadata") + require.FileExists(t, fname, "file %s does not exist", fname) + + metadata, err := os.ReadFile(fname) + require.NoError(t, err) + + m, err := variant.NewMetadata(metadata) + require.NoError(t, err) + assert.EqualValues(t, 1, m.Version()) + + keys := []string{ + "int_field", "double_field", "boolean_true_field", + "boolean_false_field", "string_field", "null_field", + "timestamp_field", + } + + for i, k := range keys { + key, err := m.KeyAt(uint32(i)) + require.NoError(t, err) + assert.Equal(t, k, key) + assert.Equal(t, uint32(i), m.IdFor(k)[0]) + } + }) +} + +func loadVariant(t *testing.T, test string) variant.Value { + dir := getVariantDir() + if dir == "" { + t.Skip("PARQUET_TEST_DATA not set") + } + + fname := filepath.Join(dir, test) + metadataPath := metadataTestFilename(fname) + valuePath := valueTestFilename(fname) + + metaBytes, err := os.ReadFile(metadataPath) + require.NoError(t, err) + valueBytes, err := os.ReadFile(valuePath) + require.NoError(t, err) + + v, err := variant.New(metaBytes, valueBytes) + require.NoError(t, err) + return v +} + +func TestPrimitiveVariants(t *testing.T) { + tests := []struct { + name string + expected any + variantType variant.Type + jsonStr string + }{ + {"primitive_boolean_true", true, variant.Bool, "true"}, + {"primitive_boolean_false", false, variant.Bool, "false"}, + {"primitive_int8", int8(42), variant.Int8, "42"}, + {"primitive_int16", int16(1234), variant.Int16, "1234"}, + {"primitive_int32", int32(123456), variant.Int32, "123456"}, + // FIXME: https://github.com/apache/parquet-testing/issues/82 + // primitive_int64 is an int32 value, but the metadata is int64 + {"primitive_int64", int32(12345678), variant.Int32, "12345678"}, + {"primitive_float", float32(1234567940.0), variant.Float, "1234568000"}, + {"primitive_double", float64(1234567890.1234), variant.Double, "1234567890.1234"}, + {"primitive_string", + `This string is longer than 64 bytes 
and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!`, + variant.String, `"This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!"`}, + {"short_string", `Less than 64 bytes (❤️ with utf8)`, variant.String, `"Less than 64 bytes (❤️ with utf8)"`}, + // 031337deadbeefcafe + {"primitive_binary", []byte{0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe}, variant.Binary, `"AxM33q2+78r+"`}, + {"primitive_decimal4", variant.DecimalValue[decimal.Decimal32]{ + Scale: 2, + Value: decimal.Decimal32(1234), + }, variant.Decimal4, `12.34`}, + {"primitive_decimal8", variant.DecimalValue[decimal.Decimal64]{ + Scale: 2, + Value: decimal.Decimal64(1234567890), + }, variant.Decimal8, `12345678.90`}, + {"primitive_decimal16", variant.DecimalValue[decimal.Decimal128]{ + Scale: 2, + Value: decimal128.FromU64(1234567891234567890), + }, variant.Decimal16, `12345678912345678.90`}, + // // 2025-04-16 + {"primitive_date", arrow.Date32(20194), variant.Date, `"2025-04-16"`}, + {"primitive_timestamp", arrow.Timestamp(1744821296780000), variant.TimestampMicros, `"2025-04-16 16:34:56.78Z"`}, + {"primitive_timestampntz", arrow.Timestamp(1744806896780000), variant.TimestampMicrosNTZ, `"` + time.UnixMicro(1744806896780000).UTC().In(time.Local).Format("2006-01-02 15:04:05.999999Z0700") + `"`}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + v := loadVariant(t, tt.name) + assert.Equal(t, tt.expected, v.Value()) + assert.Equal(t, tt.variantType, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.Equal(t, tt.jsonStr, string(out)) + }) + } +} + +func primitiveHeader(p variant.PrimitiveType) uint8 { + return (uint8(p) << 2) +} + +func TestNullValue(t *testing.T) { + emptyMeta := variant.EmptyMetadataBytes + nullChars := []byte{primitiveHeader(variant.PrimitiveNull)} + + v, err := 
variant.New(emptyMeta[:], nullChars) + require.NoError(t, err) + + assert.Equal(t, variant.Null, v.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.Equal(t, "null", string(out)) +} + +func TestSimpleInt64(t *testing.T) { + metaBytes := variant.EmptyMetadataBytes[:] + + int64Bytes := []byte{primitiveHeader(variant.PrimitiveInt64), + 0xB1, 0x1C, 0x6C, 0xB1, 0xF4, 0x10, 0x22, 0x11} + + v, err := variant.New(metaBytes, int64Bytes) + require.NoError(t, err) + + assert.Equal(t, variant.Int64, v.Type()) + assert.Equal(t, int64(1234567890987654321), v.Value()) + + negInt64Bytes := []byte{primitiveHeader(variant.PrimitiveInt64), + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF} + + v, err = variant.New(metaBytes, negInt64Bytes) + require.NoError(t, err) + + assert.Equal(t, variant.Int64, v.Type()) + assert.Equal(t, int64(-1), v.Value()) +} + +func TestObjectValues(t *testing.T) { + v := loadVariant(t, "object_primitive") + assert.Equal(t, variant.Object, v.Type()) + + obj := v.Value().(variant.ObjectValue) + assert.EqualValues(t, 7, obj.NumElements()) + + tests := []struct { + field string + expected any + typ variant.Type + }{ + {"int_field", int8(1), variant.Int8}, + {"double_field", variant.DecimalValue[decimal.Decimal32]{ + Scale: 8, Value: decimal.Decimal32(123456789)}, variant.Decimal4}, + {"boolean_true_field", true, variant.Bool}, + {"boolean_false_field", false, variant.Bool}, + {"string_field", "Apache Parquet", variant.String}, + {"null_field", nil, variant.Null}, + {"timestamp_field", "2025-04-16T12:34:56.78", variant.String}, + } + + for _, tt := range tests { + t.Run(tt.field, func(t *testing.T) { + v, err := obj.ValueByKey(tt.field) + require.NoError(t, err) + + assert.Equal(t, tt.typ, v.Value.Type()) + assert.Equal(t, tt.expected, v.Value.Value()) + }) + } + + t.Run("json", func(t *testing.T) { + out, err := json.Marshal(v) + require.NoError(t, err) + + expected := `{ + "boolean_false_field":false, + "boolean_true_field":true, + 
"double_field":1.23456789, + "int_field":1, + "null_field":null, + "string_field":"Apache Parquet", + "timestamp_field":"2025-04-16T12:34:56.78"}` + + assert.JSONEq(t, expected, string(out)) + }) + + t.Run("invalid_key", func(t *testing.T) { + v, err := obj.ValueByKey("invalid_key") + require.ErrorIs(t, err, arrow.ErrNotFound) + assert.Zero(t, v) + }) + + t.Run("field by index", func(t *testing.T) { + fieldOrder := []string{ + "boolean_false_field", + "boolean_true_field", + "double_field", + "int_field", + "null_field", + "string_field", + "timestamp_field", + } + + for i := range obj.NumElements() { + val, err := obj.FieldAt(i) + require.NoError(t, err) + + assert.Equal(t, fieldOrder[i], val.Key) + } + }) +} + +func TestNestedObjectValues(t *testing.T) { + v := loadVariant(t, "object_nested") + assert.Equal(t, variant.Object, v.Type()) + obj := v.Value().(variant.ObjectValue) + assert.EqualValues(t, 3, obj.NumElements()) + + // trying to get the exists key + id, err := obj.ValueByKey("id") + require.NoError(t, err) + assert.Equal(t, variant.Int8, id.Value.Type()) + assert.Equal(t, int8(1), id.Value.Value()) + + observation, err := obj.ValueByKey("observation") + require.NoError(t, err) + assert.Equal(t, variant.Object, observation.Value.Type()) + + species, err := obj.ValueByKey("species") + require.NoError(t, err) + assert.Equal(t, variant.Object, species.Value.Type()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `{ + "id": 1, + "observation": { + "location": "In the Volcano", + "time": "12:34:56", + "value": { + "humidity": 456, + "temperature": 123 + } + }, + "species": { + "name": "lava monster", + "population": 6789 + } + }`, string(out)) + + t.Run("inner object", func(t *testing.T) { + speciesObj := species.Value.Value().(variant.ObjectValue) + assert.EqualValues(t, 2, speciesObj.NumElements()) + + name, err := speciesObj.ValueByKey("name") + require.NoError(t, err) + assert.Equal(t, variant.String, name.Value.Type()) + 
assert.Equal(t, "lava monster", name.Value.Value()) + + population, err := speciesObj.ValueByKey("population") + require.NoError(t, err) + assert.Equal(t, variant.Int16, population.Value.Type()) + assert.Equal(t, int16(6789), population.Value.Value()) + }) + + t.Run("inner key outside", func(t *testing.T) { + // only observation should successfully retrieve key + observationKeys := []string{"location", "time", "value"} + observationObj := observation.Value.Value().(variant.ObjectValue) + speciesObj := species.Value.Value().(variant.ObjectValue) + for _, k := range observationKeys { + inner, err := observationObj.ValueByKey(k) + require.NoError(t, err) + assert.Equal(t, k, inner.Key) + + _, err = obj.ValueByKey(k) + require.ErrorIs(t, err, arrow.ErrNotFound) + + _, err = speciesObj.ValueByKey(k) + require.ErrorIs(t, err, arrow.ErrNotFound) + } + }) +} + +func TestUUID(t *testing.T) { + emptyMeta := variant.EmptyMetadataBytes[:] + uuidBytes := []byte{primitiveHeader(variant.PrimitiveUUID), + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, + 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF} + + v, err := variant.New(emptyMeta, uuidBytes) + require.NoError(t, err) + assert.Equal(t, variant.UUID, v.Type()) + assert.Equal(t, uuid.MustParse("00112233-4455-6677-8899-aabbccddeeff"), v.Value()) +} + +func TestTimestampNanos(t *testing.T) { + emptyMeta := variant.EmptyMetadataBytes[:] + + t.Run("ts nanos tz negative", func(t *testing.T) { + data := []byte{primitiveHeader(variant.PrimitiveTimestampNanos), + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF} + v, err := variant.New(emptyMeta, data) + require.NoError(t, err) + assert.Equal(t, variant.TimestampNanos, v.Type()) + assert.Equal(t, arrow.Timestamp(-1), v.Value()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `"1969-12-31 23:59:59.999999999Z"`, string(out)) + }) + + t.Run("ts nanos tz positive", func(t *testing.T) { + data := []byte{primitiveHeader(variant.PrimitiveTimestampNanos), + 0x15, 
0xC9, 0xBB, 0x86, 0xB4, 0x0C, 0x37, 0x18} + v, err := variant.New(emptyMeta, data) + require.NoError(t, err) + assert.Equal(t, variant.TimestampNanos, v.Type()) + assert.Equal(t, arrow.Timestamp(1744877350123456789), v.Value()) + + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, `"2025-04-17 08:09:10.123456789Z"`, string(out)) + }) + + t.Run("ts nanos ntz positive", func(t *testing.T) { + data := []byte{primitiveHeader(variant.PrimitiveTimestampNanosNTZ), + 0x15, 0xC9, 0xBB, 0x86, 0xB4, 0x0C, 0x37, 0x18} + v, err := variant.New(emptyMeta, data) + require.NoError(t, err) + assert.Equal(t, variant.TimestampNanosNTZ, v.Type()) + assert.Equal(t, arrow.Timestamp(1744877350123456789), v.Value()) + + tm := time.Unix(1744877350123456789/int64(time.Second), 1744877350123456789%int64(time.Second)) + tm = tm.In(time.Local) + out, err := json.Marshal(v) + require.NoError(t, err) + assert.JSONEq(t, tm.Format(`"2006-01-02 15:04:05.999999999Z0700"`), string(out)) + }) +} + +func TestArrayValues(t *testing.T) { + t.Run("array primitive", func(t *testing.T) { + v := loadVariant(t, "array_primitive") + assert.Equal(t, variant.Array, v.Type()) + + arr := v.Value().(variant.ArrayValue) + assert.EqualValues(t, 4, arr.Len()) + + elem0, err := arr.Value(0) + require.NoError(t, err) + assert.Equal(t, variant.Int8, elem0.Type()) + assert.Equal(t, int8(2), elem0.Value()) + + elem1, err := arr.Value(1) + require.NoError(t, err) + assert.Equal(t, variant.Int8, elem1.Type()) + assert.Equal(t, int8(1), elem1.Value()) + + elem2, err := arr.Value(2) + require.NoError(t, err) + assert.Equal(t, variant.Int8, elem2.Type()) + assert.Equal(t, int8(5), elem2.Value()) + + elem3, err := arr.Value(3) + require.NoError(t, err) + assert.Equal(t, variant.Int8, elem3.Type()) + assert.Equal(t, int8(9), elem3.Value()) + + _, err = arr.Value(4) + require.ErrorIs(t, err, arrow.ErrIndex) + + out, err := json.Marshal(v) + require.NoError(t, err) + expected := `[2,1,5,9]` + assert.JSONEq(t, 
expected, string(out)) + }) + + t.Run("empty array", func(t *testing.T) { + v := loadVariant(t, "array_empty") + assert.Equal(t, variant.Array, v.Type()) + + arr := v.Value().(variant.ArrayValue) + assert.EqualValues(t, 0, arr.Len()) + _, err := arr.Value(0) + require.ErrorIs(t, err, arrow.ErrIndex) + }) + + t.Run("array nested", func(t *testing.T) { + v := loadVariant(t, "array_nested") + assert.Equal(t, variant.Array, v.Type()) + + arr := v.Value().(variant.ArrayValue) + assert.EqualValues(t, 3, arr.Len()) + + elem0, err := arr.Value(0) + require.NoError(t, err) + assert.Equal(t, variant.Object, elem0.Type()) + elemObj0 := elem0.Value().(variant.ObjectValue) + assert.EqualValues(t, 2, elemObj0.NumElements()) + + id, err := elemObj0.ValueByKey("id") + require.NoError(t, err) + assert.Equal(t, variant.Int8, id.Value.Type()) + assert.Equal(t, int8(1), id.Value.Value()) + + elem1, err := arr.Value(1) + require.NoError(t, err) + assert.Equal(t, variant.Null, elem1.Type()) + + elem2, err := arr.Value(2) + require.NoError(t, err) + assert.Equal(t, variant.Object, elem2.Type()) + elemObj2 := elem2.Value().(variant.ObjectValue) + assert.EqualValues(t, 3, elemObj2.NumElements()) + id, err = elemObj2.ValueByKey("id") + require.NoError(t, err) + assert.Equal(t, variant.Int8, id.Value.Type()) + assert.Equal(t, int8(2), id.Value.Value()) + + out, err := json.Marshal(v) + require.NoError(t, err) + expected := `[ + {"id":1, "thing":{"names": ["Contrarian", "Spider"]}}, + null, + {"id":2, "names": ["Apple", "Ray", null], "type": "if"} + ]` + assert.JSONEq(t, expected, string(out)) + }) +} + +func TestInvalidMetadata(t *testing.T) { + tests := []struct { + name string + metadata []byte + errMsg string + }{ + { + name: "empty metadata", + metadata: []byte{}, + errMsg: "too short", + }, + { + name: "unsupported version", + metadata: []byte{0x02, 0x00, 0x00}, // Version != 1 is unsupported + errMsg: "unsupported version", + }, + { + name: "truncated metadata", + metadata: 
[]byte{0x01, 0x05}, // Metadata too short for its header + errMsg: "too short", + }, + { + name: "too short for dict size", + metadata: []byte{0x81, 0x01, 0x00}, // Offset size is 3, not enough bytes + errMsg: "too short for dictionary", + }, + { + name: "key count exceeds metadata size", + metadata: []byte{0x01, 0xFF, 0x00}, // Claims to have many keys but doesn't + errMsg: "out of range", + }, + { + name: "string data out of range", + metadata: []byte{0x01, 0x01, 0x00, 0x05}, + errMsg: "string data out of range", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := variant.NewMetadata(tt.metadata) + require.Error(t, err) + assert.ErrorIs(t, err, variant.ErrInvalidMetadata) + assert.Contains(t, err.Error(), tt.errMsg) + + _, err = variant.New(tt.metadata, []byte{}) + require.Error(t, err) + assert.ErrorIs(t, err, variant.ErrInvalidMetadata) + assert.Contains(t, err.Error(), tt.errMsg) + }) + } +} + +func TestInvalidValue(t *testing.T) { + tests := []struct { + name string + metadata []byte + value []byte + errMsg string + }{ + { + name: "empty value", + metadata: variant.EmptyMetadataBytes[:], + value: []byte{}, + errMsg: "invalid variant value: empty", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := variant.New(tt.metadata, tt.value) + require.Error(t, err) + assert.Contains(t, err.Error(), tt.errMsg) + }) + } +} + +func TestInvalidObjectAccess(t *testing.T) { + v := loadVariant(t, "object_primitive") + obj := v.Value().(variant.ObjectValue) + + t.Run("field_at_out_of_bounds", func(t *testing.T) { + _, err := obj.FieldAt(obj.NumElements()) + require.Error(t, err) + assert.Contains(t, err.Error(), "out of range") + assert.ErrorIs(t, err, arrow.ErrIndex) + }) + + t.Run("corrupt_id", func(t *testing.T) { + // Create a corrupt variant with invalid field ID + objBytes := v.Bytes() + idPosition := 2 // Assumes field ID is at this position - adjust if needed + + // Make a copy so we don't 
modify the original + corruptBytes := make([]byte, len(objBytes)) + copy(corruptBytes, objBytes) + + // Set field ID to an invalid value + corruptBytes[idPosition] = 0xFF + + corrupt, err := variant.NewWithMetadata(v.Metadata(), corruptBytes) + require.NoError(t, err) + + corruptObj := corrupt.Value().(variant.ObjectValue) + _, err = corruptObj.FieldAt(0) + require.Error(t, err) + assert.Contains(t, err.Error(), "fieldID") + + _, err = corruptObj.ValueByKey("int_field") + require.Error(t, err) + assert.Contains(t, err.Error(), "fieldID") + }) +} + +func TestInvalidArrayAccess(t *testing.T) { + v := loadVariant(t, "array_primitive") + arr := v.Value().(variant.ArrayValue) + + t.Run("out_of_bounds", func(t *testing.T) { + _, err := arr.Value(arr.Len()) + require.Error(t, err) + assert.Contains(t, err.Error(), "out of range") + assert.ErrorIs(t, err, arrow.ErrIndex) + }) + + t.Run("negative_index", func(t *testing.T) { + _, err := arr.Value(uint32(math.MaxUint32)) + require.Error(t, err) + assert.Contains(t, err.Error(), "out of range") + }) +} + +func TestInvalidBuilderOperations(t *testing.T) { + t.Run("invalid_object_size", func(t *testing.T) { + var b variant.Builder + start := b.Offset() + + // Move offset to before start to create invalid size + b.AppendInt(123) + fields := []variant.FieldEntry{{Key: "test", ID: 0, Offset: -10}} + + err := b.FinishObject(start+10, fields) + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid object size") + }) + + t.Run("invalid_array_size", func(t *testing.T) { + var b variant.Builder + start := b.Offset() + + // Move offset to before start to create invalid size + b.AppendInt(123) + offsets := []int{-10} + + err := b.FinishArray(start+10, offsets) + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid array size") + }) + +} + +func TestUnsupportedTypes(t *testing.T) { + var b variant.Builder + + tests := []struct { + name string + value interface{} + }{ + { + name: "complex number", + value: complex(1, 
2), + }, + { + name: "function", + value: func() {}, + }, + { + name: "channel", + value: make(chan int), + }, + { + name: "map with non-string keys", + value: map[int]string{ + 1: "test", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := b.Append(tt.value) + require.Error(t, err) + }) + } +} + +func TestDuplicateKeys(t *testing.T) { + t.Run("disallow_duplicates", func(t *testing.T) { + var b variant.Builder + b.SetAllowDuplicates(false) // default, but explicit for test clarity + + start := b.Offset() + fields := make([]variant.FieldEntry, 0) + + fields = append(fields, b.NextField(start, "key")) + require.NoError(t, b.AppendInt(1)) + + fields = append(fields, b.NextField(start, "key")) + require.NoError(t, b.AppendInt(2)) + + err := b.FinishObject(start, fields) + require.Error(t, err) + assert.Contains(t, err.Error(), "disallowed duplicate key") + }) + + t.Run("allow_duplicates", func(t *testing.T) { + var b variant.Builder + b.SetAllowDuplicates(true) + + start := b.Offset() + fields := make([]variant.FieldEntry, 0) + + fields = append(fields, b.NextField(start, "key")) + require.NoError(t, b.AppendInt(1)) + + fields = append(fields, b.NextField(start, "key")) + require.NoError(t, b.AppendInt(2)) + + require.NoError(t, b.FinishObject(start, fields)) + + v, err := b.Build() + require.NoError(t, err) + + obj := v.Value().(variant.ObjectValue) + field, err := obj.ValueByKey("key") + require.NoError(t, err) + assert.Equal(t, int8(2), field.Value.Value()) + }) +} + +func TestValueCloneConsistency(t *testing.T) { + var b variant.Builder + require.NoError(t, b.AppendString("test")) + + v, err := b.Build() + require.NoError(t, err) + + cloned := v.Clone() + + // Reset should invalidate the original value's buffer + b.Reset() + require.NoError(t, b.AppendInt(123)) + + // Original value's buffer is now used for something else + // But the cloned value should still be valid + assert.Equal(t, variant.String, cloned.Type()) + 
assert.Equal(t, "test", cloned.Value()) +}