From 8505506b5f5eb05e585c26b3ee773b092160ec14 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Tue, 6 Jan 2026 20:17:40 +0000 Subject: [PATCH 1/4] Add union documentation --- arrow-row/src/lib.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 307281bf9db1..f0a55293a18e 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -415,6 +415,41 @@ mod variable; /// ///``` /// +/// ## Union Encoding +/// +/// A union value is encoded as the type id byte followed by the row encoding of the underlying type. +/// Null values are handled by the underlying type's encoding. +/// +/// For example, given a union of Int32 (type_id = 0) and Utf8 (type_id = 1): +/// +/// ```text +/// ┌──┬──────────────┐ +/// 3 │00│01│80│00│00│03│ +/// └──┴──────────────┘ +/// │ └─ signed integer encoding (non-null) +/// └──── type_id +/// +/// "abc" ┌──┬────────────────────────────────┐ +/// │01│02│'a'│'b'│'c'│00│00│00│00│00│03│ +/// └──┴────────────────────────────────┘ +/// │ └──── string encoding (non-null) +/// └──── type_id +/// +/// null Int32 ┌──┬──────────────┐ +/// │00│00│00│00│00│00│ +/// └──┴──────────────┘ +/// │ └─ signed integer encoding (null) +/// └──── type_id +/// +/// null Utf8 ┌──┬──┐ +/// │01│00│ +/// └──┴──┘ +/// │ └─ string encoding (null) +/// └──── type_id +/// ``` +/// +/// See [`UnionArray`] for more details on union types. +/// /// # Ordering /// /// ## Float Ordering @@ -435,6 +470,11 @@ mod variable; /// /// The order of a given column can be reversed by negating the encoded bytes of non-null values /// +/// ## Union Ordering +/// +/// Values of the same type are ordered according to the ordering of that type. +/// Values of different types are ordered by their type id. 
+/// /// [COBS]: https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing /// [byte stuffing]: https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing #[derive(Debug)] From 634c517b9f71dd584297879c4f08c18c479138fe Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Tue, 6 Jan 2026 22:17:09 +0000 Subject: [PATCH 2/4] Fix alignment --- arrow-row/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index f0a55293a18e..5c6be50df29a 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -432,7 +432,7 @@ mod variable; /// "abc" ┌──┬────────────────────────────────┐ /// │01│02│'a'│'b'│'c'│00│00│00│00│00│03│ /// └──┴────────────────────────────────┘ -/// │ └──── string encoding (non-null) +/// │ └─ string encoding (non-null) /// └──── type_id /// /// null Int32 ┌──┬──────────────┐ From 41cf366c1ae62e3d24f4815d6ab743309ce621d1 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Tue, 6 Jan 2026 22:22:09 +0000 Subject: [PATCH 3/4] Fix vertical alignment --- arrow-row/src/lib.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 5c6be50df29a..8a367ea5e638 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -429,20 +429,20 @@ mod variable; /// │ └─ signed integer encoding (non-null) /// └──── type_id /// -/// "abc" ┌──┬────────────────────────────────┐ -/// │01│02│'a'│'b'│'c'│00│00│00│00│00│03│ +/// ┌──┬────────────────────────────────┐ +/// "abc" │01│02│'a'│'b'│'c'│00│00│00│00│00│03│ /// └──┴────────────────────────────────┘ /// │ └─ string encoding (non-null) /// └──── type_id /// -/// null Int32 ┌──┬──────────────┐ -/// │00│00│00│00│00│00│ +/// ┌──┬──────────────┐ +/// null Int32 │00│00│00│00│00│00│ /// └──┴──────────────┘ /// │ └─ signed integer encoding (null) /// └──── type_id /// -/// null Utf8 ┌──┬──┐ -/// │01│00│ +/// ┌──┬──┐ +/// null Utf8 │01│00│ /// └──┴──┘ /// │ └─ 
string encoding (null) /// └──── type_id From 08efd425348c0dc3675d1705e4cca57f9c52343c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 Jan 2026 12:14:33 -0500 Subject: [PATCH 4/4] Note that descending ordering also negates the type_id --- arrow-row/src/lib.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 8a367ea5e638..4cafbc2748ee 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -417,8 +417,8 @@ mod variable; /// /// ## Union Encoding /// -/// A union value is encoded as the type id byte followed by the row encoding of the underlying type. -/// Null values are handled by the underlying type's encoding. +/// A union value is encoded as a single type-id byte followed by the row encoding of the selected child value. +/// The type-id byte is always present; union arrays have no top-level null marker, so nulls are represented by the child encoding. /// /// For example, given a union of Int32 (type_id = 0) and Utf8 (type_id = 1): /// @@ -466,14 +466,15 @@ mod variable; /// The encoding described above will order nulls first, this can be inverted by representing /// nulls as `0xFF_u8` instead of `0_u8` /// -/// ## Reverse Column Ordering -/// -/// The order of a given column can be reversed by negating the encoded bytes of non-null values -/// /// ## Union Ordering /// /// Values of the same type are ordered according to the ordering of that type. /// Values of different types are ordered by their type id. +/// When descending order is specified, the type id byte is also negated, which reverses the ordering between values of different types. +/// +/// ## Reverse Column Ordering +/// +/// The order of a given column can be reversed by negating the encoded bytes of non-null values /// /// [COBS]: https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing /// [byte stuffing]: https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing