From 39fe6fc45d14accf63b7aefed5a8f1225f6b552a Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:49:57 -0400 Subject: [PATCH 001/157] GH-17682: [Go] Bool8 Extension Type Implementation (#43323) ### Rationale for this change Go implementation of #43234 ### What changes are included in this PR? - Go implementation of the `Bool8` extension type - Minor refactor of existing extension builder interfaces ### Are these changes tested? Yes, unit tests and basic read/write benchmarks are included. ### Are there any user-facing changes? - A new extension type is added - Custom extension builders no longer need another builder created and released separately. * GitHub Issue: #17682 Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- go/arrow/array/builder.go | 11 +- go/arrow/array/extension_builder.go | 10 +- go/arrow/extensions/bool8.go | 216 +++++++++++++++ go/arrow/extensions/bool8_test.go | 319 ++++++++++++++++++++++ go/arrow/extensions/extensions_test.go | 105 +++++++ go/internal/types/extension_types.go | 9 +- go/internal/types/extension_types_test.go | 16 +- go/parquet/pqarrow/encode_arrow_test.go | 4 +- 8 files changed, 663 insertions(+), 27 deletions(-) create mode 100644 go/arrow/extensions/bool8.go create mode 100644 go/arrow/extensions/bool8_test.go create mode 100644 go/arrow/extensions/extensions_test.go diff --git a/go/arrow/array/builder.go b/go/arrow/array/builder.go index 6c8ea877a2f..1f4d0ea9635 100644 --- a/go/arrow/array/builder.go +++ b/go/arrow/array/builder.go @@ -349,12 +349,13 @@ func NewBuilder(mem memory.Allocator, dtype arrow.DataType) Builder { typ := dtype.(*arrow.LargeListViewType) return NewLargeListViewBuilderWithField(mem, typ.ElemField()) case arrow.EXTENSION: - typ := dtype.(arrow.ExtensionType) - bldr := NewExtensionBuilder(mem, typ) - if custom, ok := typ.(ExtensionBuilderWrapper); ok { - return custom.NewBuilder(bldr) + if custom, ok := dtype.(CustomExtensionBuilder); ok { + return custom.NewBuilder(mem) } - return bldr + if typ, ok := dtype.(arrow.ExtensionType); ok { + return NewExtensionBuilder(mem, typ) + } + panic(fmt.Errorf("arrow/array: invalid extension type: %T", dtype)) case arrow.FIXED_SIZE_LIST: typ := dtype.(*arrow.FixedSizeListType) return NewFixedSizeListBuilderWithField(mem, typ.Len(), typ.ElemField()) diff --git a/go/arrow/array/extension_builder.go b/go/arrow/array/extension_builder.go index a71287faf0e..9c2ee880564 100644 --- a/go/arrow/array/extension_builder.go +++ b/go/arrow/array/extension_builder.go @@ -16,8 +16,10 @@ package array -// ExtensionBuilderWrapper is an interface that you need to implement in your custom extension type if you want to provide a customer builder as well. -// See example in ./arrow/internal/testing/types/extension_types.go -type ExtensionBuilderWrapper interface { - NewBuilder(bldr *ExtensionBuilder) Builder +import "github.com/apache/arrow/go/v18/arrow/memory" + +// CustomExtensionBuilder is an interface that custom extension types may implement to provide a custom builder +// instead of the underlying storage type's builder when array.NewBuilder is called with that type. 
+type CustomExtensionBuilder interface { + NewBuilder(memory.Allocator) Builder } diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go new file mode 100644 index 00000000000..20ab024a2a2 --- /dev/null +++ b/go/arrow/extensions/bool8.go @@ -0,0 +1,216 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + "fmt" + "reflect" + "strconv" + "strings" + "unsafe" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" +) + +// Bool8Type represents a logical boolean that is stored using 8 bits. +type Bool8Type struct { + arrow.ExtensionBase +} + +// NewBool8Type creates a new Bool8Type with the underlying storage type set correctly to Int8. +func NewBool8Type() *Bool8Type { + return &Bool8Type{ExtensionBase: arrow.ExtensionBase{Storage: arrow.PrimitiveTypes.Int8}} +} + +func (b *Bool8Type) ArrayType() reflect.Type { return reflect.TypeOf(Bool8Array{}) } + +func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Int8) { + return nil, fmt.Errorf("invalid storage type for Bool8Type: %s", storageType.Name()) + } + return NewBool8Type(), nil +} + +func (b *Bool8Type) ExtensionEquals(other arrow.ExtensionType) bool { + return b.ExtensionName() == other.ExtensionName() +} + +func (b *Bool8Type) ExtensionName() string { return "arrow.bool8" } + +func (b *Bool8Type) Serialize() string { return "" } + +func (b *Bool8Type) String() string { return fmt.Sprintf("extension<%s>", b.ExtensionName()) } + +func (*Bool8Type) NewBuilder(mem memory.Allocator) array.Builder { + return NewBool8Builder(mem) +} + +// Bool8Array is logically an array of boolean values but uses +// 8 bits to store values instead of 1 bit as in the native BooleanArray. 
+type Bool8Array struct { + array.ExtensionArrayBase +} + +func (a *Bool8Array) String() string { + var o strings.Builder + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(array.NullValueStr) + default: + fmt.Fprintf(&o, "%v", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Bool8Array) Value(i int) bool { + return a.Storage().(*array.Int8).Value(i) != 0 +} + +func (a *Bool8Array) BoolValues() []bool { + int8s := a.Storage().(*array.Int8).Int8Values() + return unsafe.Slice((*bool)(unsafe.Pointer(unsafe.SliceData(int8s))), len(int8s)) +} + +func (a *Bool8Array) ValueStr(i int) string { + switch { + case a.IsNull(i): + return array.NullValueStr + default: + return fmt.Sprint(a.Value(i)) + } +} + +func (a *Bool8Array) MarshalJSON() ([]byte, error) { + values := make([]interface{}, a.Len()) + for i := 0; i < a.Len(); i++ { + if a.IsValid(i) { + values[i] = a.Value(i) + } + } + return json.Marshal(values) +} + +func (a *Bool8Array) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + return a.Value(i) +} + +// boolToInt8 performs the simple scalar conversion of bool to the canonical int8 +// value for the Bool8Type. +func boolToInt8(v bool) int8 { + var res int8 + if v { + res = 1 + } + return res +} + +// Bool8Builder is a convenience builder for the Bool8 extension type, +// allowing arrays to be built with boolean values rather than the underlying storage type. +type Bool8Builder struct { + *array.ExtensionBuilder +} + +// NewBool8Builder creates a new Bool8Builder, exposing a convenient and efficient interface +// for writing boolean values to the underlying int8 storage array. +func NewBool8Builder(mem memory.Allocator) *Bool8Builder { + return &Bool8Builder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewBool8Type())} +} + +func (b *Bool8Builder) Append(v bool) { + b.ExtensionBuilder.Builder.(*array.Int8Builder).Append(boolToInt8(v)) +} + +func (b *Bool8Builder) UnsafeAppend(v bool) { + b.ExtensionBuilder.Builder.(*array.Int8Builder).UnsafeAppend(boolToInt8(v)) +} + +func (b *Bool8Builder) AppendValueFromString(s string) error { + if s == array.NullValueStr { + b.AppendNull() + return nil + } + + val, err := strconv.ParseBool(s) + if err != nil { + return err + } + + b.Append(val) + return nil +} + +func (b *Bool8Builder) AppendValues(v []bool, valid []bool) { + boolsAsInt8s := unsafe.Slice((*int8)(unsafe.Pointer(unsafe.SliceData(v))), len(v)) + b.ExtensionBuilder.Builder.(*array.Int8Builder).AppendValues(boolsAsInt8s, valid) +} + +func (b *Bool8Builder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + switch v := t.(type) { + case bool: + b.Append(v) + return nil + case string: + return b.AppendValueFromString(v) + case int8: + b.ExtensionBuilder.Builder.(*array.Int8Builder).Append(v) + return nil + case nil: + b.AppendNull() + return nil + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + Struct: "Bool8Builder", + } + } +} + +func (b *Bool8Builder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +var ( + _ arrow.ExtensionType = (*Bool8Type)(nil) + _ array.CustomExtensionBuilder = (*Bool8Type)(nil) + _ array.ExtensionArray = (*Bool8Array)(nil) + _ array.Builder = (*Bool8Builder)(nil) +) diff --git 
a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go new file mode 100644 index 00000000000..9f7365d1555 --- /dev/null +++ b/go/arrow/extensions/bool8_test.go @@ -0,0 +1,319 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions_test + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/ipc" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const ( + MINSIZE = 1024 + MAXSIZE = 65536 +) + +func TestBool8ExtensionBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + builder := extensions.NewBool8Builder(mem) + defer builder.Release() + + builder.Append(true) + builder.AppendNull() + builder.Append(false) + arr := builder.NewArray() + defer arr.Release() + + arrStr := arr.String() + require.Equal(t, "[true (null) false]", arrStr) + + jsonStr, err := json.Marshal(arr) + require.NoError(t, err) + + arr1, _, err := array.FromJSON(mem, extensions.NewBool8Type(), bytes.NewReader(jsonStr)) + require.NoError(t, err) + defer arr1.Release() + + require.Equal(t, arr, arr1) +} + +func TestBool8ExtensionRecordBuilder(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "bool8", Type: extensions.NewBool8Type()}, + }, nil) + + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer builder.Release() + + builder.Field(0).(*extensions.Bool8Builder).Append(true) + record := builder.NewRecord() + defer record.Release() + + b, err := record.MarshalJSON() + require.NoError(t, err) + require.Equal(t, "[{\"bool8\":true}\n]", string(b)) + + record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) + require.NoError(t, err) + defer record1.Release() + + require.Equal(t, record, record1) + + require.NoError(t, builder.UnmarshalJSON([]byte(`{"bool8":true}`))) + record = builder.NewRecord() + defer record.Release() + + require.Equal(t, schema, record.Schema()) + require.Equal(t, true, record.Column(0).(*extensions.Bool8Array).Value(0)) +} + +func TestBool8StringRoundTrip(t *testing.T) { + // 1. create array + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + b := extensions.NewBool8Builder(mem) + b.Append(true) + b.AppendNull() + b.Append(false) + b.AppendNull() + b.Append(true) + + arr := b.NewArray() + defer arr.Release() + + // 2. 
create array via AppendValueFromString + b1 := extensions.NewBool8Builder(mem) + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray() + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) +} + +func TestCompareBool8AndBoolean(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + bool8bldr := extensions.NewBool8Builder(mem) + defer bool8bldr.Release() + + boolbldr := array.NewBooleanBuilder(mem) + defer boolbldr.Release() + + inputVals := []bool{true, false, false, false, true} + inputValidity := []bool{true, false, true, false, true} + + bool8bldr.AppendValues(inputVals, inputValidity) + bool8Arr := bool8bldr.NewExtensionArray().(*extensions.Bool8Array) + defer bool8Arr.Release() + + boolbldr.AppendValues(inputVals, inputValidity) + boolArr := boolbldr.NewBooleanArray() + defer boolArr.Release() + + require.Equal(t, boolArr.Len(), bool8Arr.Len()) + for i := 0; i < boolArr.Len(); i++ { + require.Equal(t, boolArr.Value(i), bool8Arr.Value(i)) + } +} + +func TestReinterpretStorageEqualToValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + bool8bldr := extensions.NewBool8Builder(mem) + defer bool8bldr.Release() + + inputVals := []bool{true, false, false, false, true} + inputValidity := []bool{true, false, true, false, true} + + bool8bldr.AppendValues(inputVals, inputValidity) + bool8Arr := bool8bldr.NewExtensionArray().(*extensions.Bool8Array) + defer bool8Arr.Release() + + boolValsCopy := make([]bool, bool8Arr.Len()) + for i := 0; i < bool8Arr.Len(); i++ { + boolValsCopy[i] = bool8Arr.Value(i) + } + + boolValsZeroCopy := bool8Arr.BoolValues() + + require.Equal(t, len(boolValsZeroCopy), len(boolValsCopy)) + for i := range boolValsCopy { + require.Equal(t, boolValsZeroCopy[i], boolValsCopy[i]) + } +} + +func TestBool8TypeBatchIPCRoundTrip(t *testing.T) { + typ := extensions.NewBool8Type() + arrow.RegisterExtensionType(typ) + defer arrow.UnregisterExtensionType(typ.ExtensionName()) + + storage, _, err := array.FromJSON(memory.DefaultAllocator, arrow.PrimitiveTypes.Int8, + strings.NewReader(`[-1, 0, 1, 2, null]`)) + require.NoError(t, err) + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + batch := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "field", Type: typ, Nullable: true}}, nil), + []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.Record + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s, got: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s, got: %s", + batch, written) +} + +func BenchmarkWriteBool8Array(b *testing.B) { + bool8bldr := extensions.NewBool8Builder(memory.DefaultAllocator) + defer bool8bldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + b.ResetTimer() + b.SetBytes(int64(sz)) + for n := 0; n < b.N; 
n++ { + bool8bldr.AppendValues(values, nil) + bool8bldr.NewArray() + } + }) + } +} + +func BenchmarkWriteBooleanArray(b *testing.B) { + boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) + defer boolbldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + boolbldr.AppendValues(values, nil) + boolbldr.NewArray() + } + }) + } +} + +// storage benchmark result at package level to prevent compiler from eliminating the function call +var result []bool + +func BenchmarkReadBool8Array(b *testing.B) { + bool8bldr := extensions.NewBool8Builder(memory.DefaultAllocator) + defer bool8bldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + bool8bldr.AppendValues(values, nil) + bool8Arr := bool8bldr.NewArray().(*extensions.Bool8Array) + defer bool8Arr.Release() + + var r []bool + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + r = bool8Arr.BoolValues() + } + result = r + }) + } +} + +func BenchmarkReadBooleanArray(b *testing.B) { + boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) + defer boolbldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + output := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + boolbldr.AppendValues(values, nil) + boolArr := boolbldr.NewArray().(*array.Boolean) + defer boolArr.Release() + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + for i := 0; i < boolArr.Len(); i++ { + output[i] = boolArr.Value(i) + } + } + }) + } +} diff --git a/go/arrow/extensions/extensions_test.go b/go/arrow/extensions/extensions_test.go new file mode 100644 index 00000000000..f56fed5e132 --- /dev/null +++ b/go/arrow/extensions/extensions_test.go @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions_test + +import ( + "bytes" + "fmt" + "reflect" + "testing" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/stretchr/testify/require" +) + +// testBool8Type minimally implements arrow.ExtensionType, but importantly does not implement array.CustomExtensionBuilder +// so it will fall back to the storage type's default builder. 
+type testBool8Type struct { + arrow.ExtensionBase +} + +func newTestBool8Type() *testBool8Type { + return &testBool8Type{ExtensionBase: arrow.ExtensionBase{Storage: arrow.PrimitiveTypes.Int8}} +} + +func (t *testBool8Type) ArrayType() reflect.Type { return reflect.TypeOf(testBool8Array{}) } +func (t *testBool8Type) ExtensionEquals(arrow.ExtensionType) bool { panic("unimplemented") } +func (t *testBool8Type) ExtensionName() string { panic("unimplemented") } +func (t *testBool8Type) Serialize() string { panic("unimplemented") } +func (t *testBool8Type) Deserialize(arrow.DataType, string) (arrow.ExtensionType, error) { + panic("unimplemented") +} + +type testBool8Array struct { + array.ExtensionArrayBase +} + +func TestUnmarshalExtensionTypes(t *testing.T) { + logicalJSON := `[true,null,false,null,true]` + storageJSON := `[1,null,0,null,1]` + + // extensions.Bool8Type implements array.CustomExtensionBuilder so we expect the array to be built with the custom builder + arrCustomBuilder, _, err := array.FromJSON(memory.DefaultAllocator, extensions.NewBool8Type(), bytes.NewBufferString(logicalJSON)) + require.NoError(t, err) + defer arrCustomBuilder.Release() + require.Equal(t, 5, arrCustomBuilder.Len()) + + // testBoolType falls back to the default builder for the storage type, so it cannot deserialize native booleans + _, _, err = array.FromJSON(memory.DefaultAllocator, newTestBool8Type(), bytes.NewBufferString(logicalJSON)) + require.ErrorContains(t, err, "cannot unmarshal true into Go value of type int8") + + // testBoolType must build the array with the native storage type: Int8 + arrDefaultBuilder, _, err := array.FromJSON(memory.DefaultAllocator, newTestBool8Type(), bytes.NewBufferString(storageJSON)) + require.NoError(t, err) + defer arrDefaultBuilder.Release() + require.Equal(t, 5, arrDefaultBuilder.Len()) + + arrBool8, ok := arrCustomBuilder.(*extensions.Bool8Array) + require.True(t, ok) + + arrExt, ok := arrDefaultBuilder.(array.ExtensionArray) + require.True(t, ok) + + // The physical layout of both arrays is identical + require.True(t, array.Equal(arrBool8.Storage(), arrExt.Storage())) +} + +// invalidExtensionType does not fully implement the arrow.ExtensionType interface, even though it embeds arrow.ExtensionBase +type invalidExtensionType struct { + arrow.ExtensionBase +} + +func newInvalidExtensionType() *invalidExtensionType { + return &invalidExtensionType{ExtensionBase: arrow.ExtensionBase{Storage: arrow.BinaryTypes.String}} +} + +func TestInvalidExtensionType(t *testing.T) { + jsonStr := `["one","two","three"]` + typ := newInvalidExtensionType() + + require.PanicsWithError(t, fmt.Sprintf("arrow/array: invalid extension type: %T", typ), func() { + array.FromJSON(memory.DefaultAllocator, typ, bytes.NewBufferString(jsonStr)) + }) +} + +var ( + _ arrow.ExtensionType = (*testBool8Type)(nil) + _ array.ExtensionArray = (*testBool8Array)(nil) +) diff --git a/go/internal/types/extension_types.go b/go/internal/types/extension_types.go index 3c63b368746..85c64d86bff 100644 --- a/go/internal/types/extension_types.go +++ b/go/internal/types/extension_types.go @@ -26,6 +26,7 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/internal/json" "github.com/google/uuid" "golang.org/x/xerrors" @@ -37,8 +38,8 @@ type UUIDBuilder struct { *array.ExtensionBuilder } -func NewUUIDBuilder(builder *array.ExtensionBuilder) *UUIDBuilder { - return &UUIDBuilder{ExtensionBuilder: 
builder} +func NewUUIDBuilder(mem memory.Allocator) *UUIDBuilder { + return &UUIDBuilder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewUUIDType())} } func (b *UUIDBuilder) Append(v uuid.UUID) { @@ -245,8 +246,8 @@ func (e *UUIDType) ExtensionEquals(other arrow.ExtensionType) bool { return e.ExtensionName() == other.ExtensionName() } -func (*UUIDType) NewBuilder(bldr *array.ExtensionBuilder) array.Builder { - return NewUUIDBuilder(bldr) +func (*UUIDType) NewBuilder(mem memory.Allocator) array.Builder { + return NewUUIDBuilder(mem) } // Parametric1Array is a simple int32 array for use with the Parametric1Type diff --git a/go/internal/types/extension_types_test.go b/go/internal/types/extension_types_test.go index 50abaae3a9e..65f6353d01b 100644 --- a/go/internal/types/extension_types_test.go +++ b/go/internal/types/extension_types_test.go @@ -32,12 +32,10 @@ import ( var testUUID = uuid.New() -func TestExtensionBuilder(t *testing.T) { +func TestUUIDExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder.Release() - builder := types.NewUUIDBuilder(extBuilder) + builder := types.NewUUIDBuilder(mem) builder.Append(testUUID) arr := builder.NewArray() defer arr.Release() @@ -52,7 +50,7 @@ func TestExtensionBuilder(t *testing.T) { assert.Equal(t, arr, arr1) } -func TestExtensionRecordBuilder(t *testing.T) { +func TestUUIDExtensionRecordBuilder(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ {Name: "uuid", Type: types.NewUUIDType()}, }, nil) @@ -72,9 +70,7 @@ func TestUUIDStringRoundTrip(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder.Release() - b := types.NewUUIDBuilder(extBuilder) + b := types.NewUUIDBuilder(mem) b.Append(uuid.Nil) b.AppendNull() b.Append(uuid.NameSpaceURL) @@ -85,9 +81,7 @@ func TestUUIDStringRoundTrip(t *testing.T) { defer arr.Release() // 2. create array via AppendValueFromString - extBuilder1 := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder1.Release() - b1 := types.NewUUIDBuilder(extBuilder1) + b1 := types.NewUUIDBuilder(mem) defer b1.Release() for i := 0; i < arr.Len(); i++ { diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go index 9b3419988d6..16282173a68 100644 --- a/go/parquet/pqarrow/encode_arrow_test.go +++ b/go/parquet/pqarrow/encode_arrow_test.go @@ -2053,9 +2053,7 @@ func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(ps.T(), 0) - extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder.Release() - builder := types.NewUUIDBuilder(extBuilder) + builder := types.NewUUIDBuilder(mem) builder.Append(uuid.New()) arr := builder.NewArray() defer arr.Release() From 483bc7b6d10d62e3bb83c167569cde84e2912744 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 13 Aug 2024 07:45:11 +0530 Subject: [PATCH 002/157] GH-43638: [Java] LargeListViewVector RangeEqualVisitor and TypeEqualVisitor integration (#43642) ### Rationale for this change LargeListViewVector requires `RangeEqualVisitor` and `TypeEqualVisitor` to support the C Data interface. ### What changes are included in this PR? Adding `RangeEqualVisitor`, `TypeEqualVisitor` and the corresponding test cases. 
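As a quick illustration (a hedged sketch, not part of this patch; the class name and the vector population step are placeholders), the new overloads can be exercised roughly as follows, mirroring the test cases added below:

```java
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.compare.Range;
import org.apache.arrow.vector.compare.RangeEqualsVisitor;
import org.apache.arrow.vector.compare.TypeEqualsVisitor;
import org.apache.arrow.vector.complex.LargeListViewVector;

class LargeListViewVisitorSketch {
  static void compare() {
    try (BufferAllocator allocator = new RootAllocator();
        LargeListViewVector left = LargeListViewVector.empty("largelistview", allocator);
        LargeListViewVector right = LargeListViewVector.empty("largelistview", allocator)) {
      // ... populate both vectors identically via their writers ...

      // Type-level comparison now dispatches to the LargeListViewVector overload.
      boolean sameType = new TypeEqualsVisitor(right).equals(left);

      // Value-level comparison over a range of entries in both vectors.
      RangeEqualsVisitor visitor = new RangeEqualsVisitor(left, right);
      boolean sameValues = visitor.rangeEquals(new Range(0, 0, left.getValueCount()));
    }
  }
}
```

Before this change, `LargeListViewVector#accept` threw `UnsupportedOperationException`, so neither comparison was possible.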
### Are these changes tested? Yes. ### Are there any user-facing changes? No * GitHub Issue: #43638 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- .../vector/compare/RangeEqualsVisitor.java | 57 ++++++++++ .../vector/compare/TypeEqualsVisitor.java | 6 ++ .../arrow/vector/compare/VectorVisitor.java | 6 ++ .../vector/complex/LargeListViewVector.java | 2 +- .../apache/arrow/vector/TestValueVector.java | 95 ++++++++++++++++ .../compare/TestRangeEqualsVisitor.java | 102 ++++++++++++++++++ .../vector/compare/TestTypeEqualsVisitor.java | 17 +++ 7 files changed, 284 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index fbc28a3609c..9aa1bffb846 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -31,11 +31,13 @@ import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.BaseLargeRepeatedValueViewVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; @@ -244,6 +246,14 @@ public Boolean visit(ListViewVector left, Range range) { return compareListViewVectors(range); } + @Override + public Boolean visit(LargeListViewVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareLargeListViewVectors(range); + } + protected RangeEqualsVisitor createInnerVisitor( ValueVector leftInner, ValueVector rightInner, @@ -759,4 +769,51 @@ protected boolean compareListViewVectors(Range range) { } return true; } + + protected boolean compareLargeListViewVectors(Range range) { + LargeListViewVector leftVector = (LargeListViewVector) left; + LargeListViewVector rightVector = (LargeListViewVector) right; + + RangeEqualsVisitor innerVisitor = + createInnerVisitor( + leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); + Range innerRange = new Range(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + int offsetWidth = BaseLargeRepeatedValueViewVector.OFFSET_WIDTH; + int sizeWidth = BaseLargeRepeatedValueViewVector.SIZE_WIDTH; + + if (!isNull) { + final int startIndexLeft = + leftVector.getOffsetBuffer().getInt((long) leftIndex * offsetWidth); + final int leftSize = leftVector.getSizeBuffer().getInt((long) leftIndex * sizeWidth); + + final int startIndexRight = + rightVector.getOffsetBuffer().getInt((long) rightIndex * offsetWidth); + final int rightSize = rightVector.getSizeBuffer().getInt((long) rightIndex * sizeWidth); + + if (leftSize != rightSize) { + return false; + } + + innerRange = + innerRange + .setRightStart(startIndexRight) + 
.setLeftStart(startIndexLeft) + .setLength(leftSize); + if (!innerVisitor.rangeEquals(innerRange)) { + return false; + } + } + } + return true; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index 6e15d6a83e7..ce92b22ef61 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -28,6 +28,7 @@ import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; @@ -130,6 +131,11 @@ public Boolean visit(ListViewVector left, Void value) { return compareField(left.getField(), right.getField()); } + @Override + public Boolean visit(LargeListViewVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + private boolean compareField(Field leftField, Field rightField) { if (leftField == rightField) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java index c912359d4af..e20f8cd9cfb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -25,6 +25,7 @@ import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; @@ -65,4 +66,9 @@ public interface VectorVisitor { default OUT visit(ListViewVector left, IN value) { throw new UnsupportedOperationException("VectorVisitor for ListViewVector is not supported."); } + + default OUT visit(LargeListViewVector left, IN value) { + throw new UnsupportedOperationException( + "VectorVisitor for LargeListViewVector is not supported."); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java index 1bb24a53fc2..17ccdbf0eae 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java @@ -449,7 +449,7 @@ public int hashCode(int index, ArrowBufHasher hasher) { @Override public OUT accept(VectorVisitor visitor, IN value) { - throw new UnsupportedOperationException(); + return visitor.visit(this, value); } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 4dd55afdb8b..83e470ae258 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -46,11 +46,13 @@ import 
org.apache.arrow.vector.compare.VectorEqualsVisitor; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.complex.impl.UnionLargeListViewWriter; import org.apache.arrow.vector.complex.impl.UnionListViewWriter; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.holders.NullableIntHolder; @@ -2910,6 +2912,35 @@ public void testListViewVectorEqualsWithNull() { } } + @Test + public void testLargeListViewVectorEqualsWithNull() { + try (final LargeListViewVector vector1 = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector vector2 = + LargeListViewVector.empty("largelistview", allocator); ) { + + UnionLargeListViewWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + // set some values + writeLargeListViewVector(writer1, new int[] {1, 2}); + writeLargeListViewVector(writer1, new int[] {3, 4}); + writeLargeListViewVector(writer1, new int[] {}); + writer1.setValueCount(3); + + UnionLargeListViewWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + // set some values + writeLargeListViewVector(writer2, new int[] {1, 2}); + writeLargeListViewVector(writer2, new int[] {3, 4}); + writer2.setValueCount(3); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + + assertFalse(visitor.vectorEquals(vector1, vector2)); + } + } + @Test public void testListVectorEquals() { try (final ListVector vector1 = ListVector.empty("list", allocator); @@ -2974,6 +3005,39 @@ public void testListViewVectorEquals() { } } + @Test + public void testLargeListViewVectorEquals() { + try (final LargeListViewVector vector1 = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector vector2 = + LargeListViewVector.empty("largelistview", allocator); ) { + + UnionLargeListViewWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + // set some values + writeLargeListViewVector(writer1, new int[] {1, 2}); + writeLargeListViewVector(writer1, new int[] {3, 4}); + writeLargeListViewVector(writer1, new int[] {5, 6}); + writer1.setValueCount(3); + + UnionLargeListViewWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + // set some values + writeLargeListViewVector(writer2, new int[] {1, 2}); + writeLargeListViewVector(writer2, new int[] {3, 4}); + writer2.setValueCount(2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + + writeLargeListViewVector(writer2, new int[] {5, 6}); + writer2.setValueCount(3); + + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + @Test public void testListVectorSetNull() { try (final ListVector vector = ListVector.empty("list", allocator)) { @@ -3020,6 +3084,29 @@ public void testListViewVectorSetNull() { } } + @Test + public void testLargeListViewVectorSetNull() { + try (final LargeListViewVector vector = LargeListViewVector.empty("largelistview", allocator)) { + UnionLargeListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writeLargeListViewVector(writer, new int[] {1, 2}); + writeLargeListViewVector(writer, new int[] {3, 4}); + 
writeLargeListViewVector(writer, new int[] {5, 6}); + vector.setNull(3); + vector.setNull(4); + vector.setNull(5); + writer.setValueCount(6); + + assertEquals(vector.getObject(0), Arrays.asList(1, 2)); + assertEquals(vector.getObject(1), Arrays.asList(3, 4)); + assertEquals(vector.getObject(2), Arrays.asList(5, 6)); + assertTrue(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertTrue(vector.isNull(5)); + } + } + @Test public void testStructVectorEqualsWithNull() { @@ -3359,6 +3446,14 @@ private void writeListViewVector(UnionListViewWriter writer, int[] values) { writer.endListView(); } + private void writeLargeListViewVector(UnionLargeListViewWriter writer, int[] values) { + writer.startListView(); + for (int v : values) { + writer.integer().writeInt(v); + } + writer.endListView(); + } + @Test public void testVariableVectorGetEndOffset() { try (final VarCharVector vector1 = new VarCharVector("v1", allocator); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index 7e91b760430..eca5c2d9b2a 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -36,12 +36,14 @@ import org.apache.arrow.vector.compare.util.ValueEpsilonEqualizers; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.NullableStructWriter; import org.apache.arrow.vector.complex.impl.UnionFixedSizeListWriter; +import org.apache.arrow.vector.complex.impl.UnionLargeListViewWriter; import org.apache.arrow.vector.complex.impl.UnionListViewWriter; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.holders.NullableBigIntHolder; @@ -221,6 +223,25 @@ public void testListViewVectorWithDifferentChild() { } } + @Test + public void testLargeListViewVectorWithDifferentChild() { + try (final LargeListViewVector vector1 = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector vector2 = + LargeListViewVector.empty("largelistview", allocator); ) { + + vector1.allocateNew(); + vector1.initializeChildrenFromFields( + Arrays.asList(Field.nullable("child", new ArrowType.Int(32, true)))); + + vector2.allocateNew(); + vector2.initializeChildrenFromFields( + Arrays.asList(Field.nullable("child", new ArrowType.Int(64, true)))); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertFalse(visitor.rangeEquals(new Range(0, 0, 0))); + } + } + @Test public void testListVectorRangeEquals() { try (final ListVector vector1 = ListVector.empty("list", allocator); @@ -285,6 +306,39 @@ public void testListViewVectorRangeEquals() { } } + @Test + public void testLargeListViewVectorRangeEquals() { + try (final LargeListViewVector vector1 = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector vector2 = + LargeListViewVector.empty("largelistview", allocator); ) { + + UnionLargeListViewWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + // set some values + 
writeLargeListViewVector(writer1, new int[] {1, 2}); + writeLargeListViewVector(writer1, new int[] {3, 4}); + writeLargeListViewVector(writer1, new int[] {5, 6}); + writeLargeListViewVector(writer1, new int[] {7, 8}); + writeLargeListViewVector(writer1, new int[] {9, 10}); + writer1.setValueCount(5); + + UnionLargeListViewWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + // set some values + writeLargeListViewVector(writer2, new int[] {0, 0}); + writeLargeListViewVector(writer2, new int[] {3, 4}); + writeLargeListViewVector(writer2, new int[] {5, 6}); + writeLargeListViewVector(writer2, new int[] {7, 8}); + writeLargeListViewVector(writer2, new int[] {0, 0}); + writer2.setValueCount(5); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + } + } + @Test public void testBitVectorRangeEquals() { try (final BitVector vector1 = new BitVector("v1", allocator); @@ -903,6 +957,38 @@ public void testListViewVectorApproxEquals() { } } + @Test + public void testLargeListViewVectorApproxEquals() { + try (final LargeListViewVector right = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector left1 = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector left2 = LargeListViewVector.empty("largelistview", allocator); ) { + + final float epsilon = 1.0E-6f; + + UnionLargeListViewWriter rightWriter = right.getWriter(); + rightWriter.allocate(); + writeLargeListViewVector(rightWriter, new double[] {1, 2}); + writeLargeListViewVector(rightWriter, new double[] {1.01, 2.02}); + rightWriter.setValueCount(2); + + UnionLargeListViewWriter leftWriter1 = left1.getWriter(); + leftWriter1.allocate(); + writeLargeListViewVector(leftWriter1, new double[] {1, 2}); + writeLargeListViewVector(leftWriter1, new double[] {1.01 + epsilon / 2, 2.02 - epsilon / 2}); + leftWriter1.setValueCount(2); + + UnionLargeListViewWriter leftWriter2 = left2.getWriter(); + leftWriter2.allocate(); + writeLargeListViewVector(leftWriter2, new double[] {1, 2}); + writeLargeListViewVector(leftWriter2, new double[] {1.01 + epsilon * 2, 2.02 - epsilon * 2}); + leftWriter2.setValueCount(2); + + Range range = new Range(0, 0, right.getValueCount()); + assertTrue(new ApproxEqualsVisitor(left1, right, epsilon, epsilon).rangeEquals(range)); + assertFalse(new ApproxEqualsVisitor(left2, right, epsilon, epsilon).rangeEquals(range)); + } + } + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { writer.start(); writer.integer("f0").writeInt(value1); @@ -933,6 +1019,14 @@ private void writeListViewVector(UnionListViewWriter writer, int[] values) { writer.endListView(); } + private void writeLargeListViewVector(UnionLargeListViewWriter writer, int[] values) { + writer.startListView(); + for (int v : values) { + writer.integer().writeInt(v); + } + writer.endListView(); + } + private void writeFixedSizeListVector(UnionFixedSizeListWriter writer, int[] values) { writer.startList(); for (int v : values) { @@ -956,4 +1050,12 @@ private void writeListViewVector(UnionListViewWriter writer, double[] values) { } writer.endListView(); } + + private void writeLargeListViewVector(UnionLargeListViewWriter writer, double[] values) { + writer.startListView(); + for (double v : values) { + writer.float8().writeFloat8(v); + } + writer.endListView(); + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java 
b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java index d65096205fd..ce029493473 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java @@ -32,6 +32,7 @@ import org.apache.arrow.vector.ViewVarBinaryVector; import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; @@ -121,6 +122,22 @@ public void testListViewTypeEquals() { } } + @Test + public void testLargeListViewTypeEquals() { + try (final LargeListViewVector right = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector left1 = LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector left2 = LargeListViewVector.empty("largelistview", allocator)) { + + right.addOrGetVector(FieldType.nullable(new ArrowType.Utf8())); + left1.addOrGetVector(FieldType.nullable(new ArrowType.Utf8())); + left2.addOrGetVector(FieldType.nullable(new ArrowType.FixedSizeBinary(2))); + + TypeEqualsVisitor visitor = new TypeEqualsVisitor(right); + assertTrue(visitor.equals(left1)); + assertFalse(visitor.equals(left2)); + } + } + @Test public void testStructTypeEquals() { try (final StructVector right = StructVector.empty("struct", allocator); From e8e9d1ac2b9761b40eb0e041127285b55655e49c Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 14 Aug 2024 01:46:02 +0200 Subject: [PATCH 003/157] GH-43536: [Python] Declare support for free-threading in Cython (#43606) ### Rationale for this change This is done by passing an extra flag when building the Cython extension modules. It is needed so that the GIL is not dynamically reenabled when importing `pyarrow.lib`. ### What changes are included in this PR? Changes to CMake so that the extra flag is passed when building Cython extension modules. * GitHub Issue: #43536 Lead-authored-by: Lysandros Nikolaou Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/UseCython.cmake | 5 +++++ python/CMakeLists.txt | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/cpp/cmake_modules/UseCython.cmake b/cpp/cmake_modules/UseCython.cmake index e15ac59490c..7d88daa4fad 100644 --- a/cpp/cmake_modules/UseCython.cmake +++ b/cpp/cmake_modules/UseCython.cmake @@ -184,4 +184,9 @@ function(cython_add_module _name pyx_target_name generated_files) add_dependencies(${_name} ${pyx_target_name}) endfunction() +execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from Cython.Compiler.Version import version; print(version)" + OUTPUT_VARIABLE CYTHON_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE) +set(CYTHON_VERSION "${CYTHON_VERSION_OUTPUT}") + include(CMakeParseArguments) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index a90dee70584..5d5eeaf8157 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -260,6 +260,7 @@ message(STATUS "Found NumPy version: ${Python3_NumPy_VERSION}") message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") include(UseCython) +message(STATUS "Found Cython version: ${CYTHON_VERSION}") # Arrow C++ and set default PyArrow build options include(GNUInstallDirs) @@ -855,6 +856,10 @@ set(CYTHON_FLAGS "${CYTHON_FLAGS}" "--warning-errors") # undocumented Cython feature. 
set(CYTHON_FLAGS "${CYTHON_FLAGS}" "--no-c-in-traceback") +if(CYTHON_VERSION VERSION_GREATER_EQUAL "3.1.0a0") + list(APPEND CYTHON_FLAGS "-Xfreethreading_compatible=True") +endif() + foreach(module ${CYTHON_EXTENSIONS}) string(REPLACE "." ";" directories ${module}) list(GET directories -1 module_name) From fc80d7d8b9f80152415fc333e0850358bf217db9 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 13 Aug 2024 19:48:47 -0400 Subject: [PATCH 004/157] GH-43378: [Java][CI] Don't configure multithreading when building javadocs (#43674) ### Rationale for this change Apparently some maven plugins are not thread safe and started throwing errors in the `test-debian-12-docs` CI job when building javadocs. ### What changes are included in this PR? * Remove multithreading config when building javadocs ### Are these changes tested? CI ### Are there any user-facing changes? No * GitHub Issue: #43378 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- ci/scripts/java_build.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 0fa1edab429..212ec6eb114 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -72,9 +72,6 @@ if [ $ARROW_JAVA_SKIP_GIT_PLUGIN ]; then mvn="${mvn} -Dmaven.gitcommitid.skip=true" fi -# Use `2 * ncores` threads -mvn="${mvn} -T 2C" - # https://github.com/apache/arrow/issues/41429 # TODO: We want to out-of-source build. This is a workaround. We copy # all needed files to the build directory from the source directory @@ -98,10 +95,12 @@ if [ "${ARROW_JAVA_JNI}" = "ON" ]; then mvn="${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni" fi -${mvn} clean install +# Use `2 * ncores` threads +${mvn} -T 2C clean install if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 + # GH-43378: Maven site plugins not compatible with multithreading mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference From 88e8140ad7902435b5d1ac29205dda7517f2cc79 Mon Sep 17 00:00:00 2001 From: Oliver Layer Date: Wed, 14 Aug 2024 02:16:54 +0200 Subject: [PATCH 005/157] GH-43097: [C++] Implement `PathFromUri` support for Azure file system (#43098) ### Rationale for this change See #43097. ### What changes are included in this PR? Implements `AzureFS::PathFromUri` using existing URI parsing and path extraction inside the `AzureOptions`. ### Are these changes tested? Yes, added a unit test. ### Are there any user-facing changes? No, but calling `PathFromUri` will now work instead of throwing due to no implementation provided. 
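For illustration, a hedged usage sketch (not part of this patch) of the behavior described above; `options` is assumed to already carry a valid account configuration and credentials:

```cpp
#include <string>

#include "arrow/filesystem/azurefs.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status PathFromUriSketch(const arrow::fs::AzureOptions& options) {
  // Assumes `options` is already configured (account, credentials, etc.).
  ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::AzureFileSystem::Make(options));

  // (1) The authority is the storage account host and is not prepended to the path.
  ARROW_ASSIGN_OR_RAISE(
      auto path1,
      fs->PathFromUri("abfss://storageacc.blob.core.windows.net/container/some/path"));

  // (2) The authority carries credentials, so the container is part of the path.
  ARROW_ASSIGN_OR_RAISE(auto path2,
                        fs->PathFromUri("abfss://acc:pw@container/some/path"));

  // Both are expected to yield "container/some/path"; any scheme other than
  // abfs/abfss returns Status::Invalid.
  return path1 == path2 ? arrow::Status::OK()
                        : arrow::Status::Invalid("paths differ: ", path1, " vs ", path2);
}
```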
* GitHub Issue: #43097 Authored-by: Oliver Layer Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 27 ++++++++++++++++++++++++ cpp/src/arrow/filesystem/azurefs.h | 2 ++ cpp/src/arrow/filesystem/azurefs_test.cc | 9 ++++++++ 3 files changed, 38 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a3aa2c8e837..9b3c0c0c1d7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -3199,4 +3199,31 @@ Result> AzureFileSystem::OpenAppendStream( return impl_->OpenAppendStream(location, metadata, false, this); } +Result AzureFileSystem::PathFromUri(const std::string& uri_string) const { + /// We can not use `internal::PathFromUriHelper` here because for Azure we have to + /// support different URI schemes where the authority is handled differently. + /// Example (both should yield the same path `container/some/path`): + /// - (1) abfss://storageacc.blob.core.windows.net/container/some/path + /// - (2) abfss://acc:pw@container/some/path + /// The authority handling is different with these two URIs. (1) requires no prepending + /// of the authority to the path, while (2) requires to preprend the authority to the + /// path. + std::string path; + Uri uri; + RETURN_NOT_OK(uri.Parse(uri_string)); + RETURN_NOT_OK(AzureOptions::FromUri(uri, &path)); + + std::vector supported_schemes = {"abfs", "abfss"}; + const auto scheme = uri.scheme(); + if (std::find(supported_schemes.begin(), supported_schemes.end(), scheme) == + supported_schemes.end()) { + std::string expected_schemes = + ::arrow::internal::JoinStrings(supported_schemes, ", "); + return Status::Invalid("The filesystem expected a URI with one of the schemes (", + expected_schemes, ") but received ", uri_string); + } + + return path; +} + } // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 93d6ec2f945..072b061eeb2 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -367,6 +367,8 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { Result> OpenAppendStream( const std::string& path, const std::shared_ptr& metadata) override; + + Result PathFromUri(const std::string& uri_string) const override; }; } // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 9a11a6f2499..36646f417cb 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -2958,5 +2958,14 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); ASSERT_RAISES(Invalid, stream->Seek(2)); } + +TEST_F(TestAzuriteFileSystem, PathFromUri) { + ASSERT_EQ( + "container/some/path", + fs()->PathFromUri("abfss://storageacc.blob.core.windows.net/container/some/path")); + ASSERT_EQ("container/some/path", + fs()->PathFromUri("abfss://acc:pw@container/some/path")); + ASSERT_RAISES(Invalid, fs()->PathFromUri("http://acc:pw@container/some/path")); +} } // namespace fs } // namespace arrow From 01fd7fc18ca737edf0afbcc6afa349206b055a09 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:26:27 +0900 Subject: [PATCH 006/157] MINOR: [Go] Bump github.com/substrait-io/substrait-go from 0.5.0 to 0.6.0 in /go (#43647) Bumps [github.com/substrait-io/substrait-go](https://github.com/substrait-io/substrait-go) from 0.5.0 to 0.6.0.
Release notes (sourced from github.com/substrait-io/substrait-go's releases):

v0.6.0 (2024-08-11)

Features
- type: add support for type PrecisionTimestamp and PrecisionTimestampTz (#41) (5040d09)
- substrait: update to Substrait v0.53.0 (#40) (0ea5482)
  - Update the substrait dependency to v0.53.0
  - Accommodate UserDefined Literal changes where the literal value became a oneof in proto instead of a direct value
  - Fix the AdvanceExtension interface to accommodate a breaking change in AdvanceExtensionProto
  - Add a linter rule to ignore internal use of deprecated methods

Commits
- 5040d09 feat(type): add support for type PrecisionTimestamp and PrecisionTimestampTz ...
- 0ea5482 feat(substrait): Update to Substrait v0.53.0 (#40)
- 2fc8f58 ci(build-test): Use grep to exclude protobuf from coverage report (#38)
- b3aa515 ci(build-test): Update codecov to ignore protobuf files
- 15314a8 ci(build-test): Add codecov and release branch action badges. (#36)
- 663c26d ci(build-test): Add codecov reports (#35)
- See full diff in compare view

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/go/go.mod b/go/go.mod index 09869b7a383..9f4222a541b 100644 --- a/go/go.mod +++ b/go/go.mod @@ -49,7 +49,7 @@ require ( github.com/google/uuid v1.6.0 github.com/hamba/avro/v2 v2.24.1 github.com/huandu/xstrings v1.4.0 - github.com/substrait-io/substrait-go v0.5.0 + github.com/substrait-io/substrait-go v0.6.0 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index 2e89a769024..c7eb3a66dee 100644 --- a/go/go.sum +++ b/go/go.sum @@ -24,8 +24,8 @@ github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8c github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= -github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= -github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= +github.com/go-playground/validator/v10 v10.11.1 h1:prmOlTVv+YjZjmRmNSF3VmspqJIxJWXmqUsHwfTRRkQ= +github.com/go-playground/validator/v10 v10.11.1/go.mod h1:i+3WkQ1FvaUjjxh1kSvIA4dMGDBiPU55YFDl0WbKdWU= github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA= github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/goccy/go-yaml v1.11.0 h1:n7Z+zx8S9f9KgzG6KtQKf+kwqXZlLNR2F6018Dgau54= @@ -99,8 +99,8 @@ github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/substrait-io/substrait-go v0.5.0 h1:8sYsoqcrzoNpThPyot1CQpwF6OokxvplLUQJTGlKws4= -github.com/substrait-io/substrait-go v0.5.0/go.mod h1:Co7ko6iIjdqCGcN3LfkKWPVlxONkNZem9omWAGIaOrQ= +github.com/substrait-io/substrait-go v0.6.0 h1:n2G/SGmrn7U5Q39VA8WeM2UfVL5Y/6HX8WAP9uJLNk4= +github.com/substrait-io/substrait-go v0.6.0/go.mod h1:cl8Wsc7aBPDfcHp9+OrUqGpjkgrYlhcDsH/lMP6KUZA= github.com/tidwall/gjson v1.14.2 h1:6BBkirS0rAHjumnjHF6qgy5d2YAJ1TLIaFE2lzfOLqo= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= From 69bce8f0cd02297ecc31caef22db67e654c16e28 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 13 Aug 2024 21:27:36 -0300 Subject: [PATCH 007/157] GH-43677: [C++][FlightRPC] Move the FlightTestServer to its own .cc and .h files (#43678) ### Rationale for this change One way of learning about a codebase is reading the tests. As it is now, it's hard to see the minimal `FlightServerBase` sub-class in `flight/test_util.cc`, so I moved it to its own file. ### What changes are included in this PR? - Renaming `FlightTestServer` to `TestFlightServer` - Moving the class to `test_flight_server.{h,cc}` - Bonus: Moving the server and client auth handlers to `test_auth_handlers.{h,cc}` ### Are these changes tested? By existing tests. ### Are there any user-facing changes? 
`ExampleTestServer` is removed from the testing library in favor of `FlightTestServer::Make`. * GitHub Issue: #43677 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/flight/CMakeLists.txt | 2 + cpp/src/arrow/flight/flight_test.cc | 8 +- .../integration_tests/test_integration.cc | 1 + cpp/src/arrow/flight/test_auth_handlers.cc | 141 +++++ cpp/src/arrow/flight/test_auth_handlers.h | 89 ++++ cpp/src/arrow/flight/test_definitions.cc | 15 +- cpp/src/arrow/flight/test_flight_server.cc | 417 +++++++++++++++ cpp/src/arrow/flight/test_flight_server.h | 92 ++++ cpp/src/arrow/flight/test_server.cc | 3 +- cpp/src/arrow/flight/test_util.cc | 486 +----------------- cpp/src/arrow/flight/test_util.h | 65 --- 11 files changed, 759 insertions(+), 560 deletions(-) create mode 100644 cpp/src/arrow/flight/test_auth_handlers.cc create mode 100644 cpp/src/arrow/flight/test_auth_handlers.h create mode 100644 cpp/src/arrow/flight/test_flight_server.cc create mode 100644 cpp/src/arrow/flight/test_flight_server.h diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index 43ac48b8767..98f93705f6f 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -262,7 +262,9 @@ if(ARROW_TESTING) OUTPUTS ARROW_FLIGHT_TESTING_LIBRARIES SOURCES + test_auth_handlers.cc test_definitions.cc + test_flight_server.cc test_util.cc DEPENDENCIES flight_grpc_gen diff --git a/cpp/src/arrow/flight/flight_test.cc b/cpp/src/arrow/flight/flight_test.cc index 101bb06b212..3d52bc3f5ae 100644 --- a/cpp/src/arrow/flight/flight_test.cc +++ b/cpp/src/arrow/flight/flight_test.cc @@ -52,7 +52,9 @@ // Include before test_util.h (boost), contains Windows fixes #include "arrow/flight/platform.h" #include "arrow/flight/serialization_internal.h" +#include "arrow/flight/test_auth_handlers.h" #include "arrow/flight/test_definitions.h" +#include "arrow/flight/test_flight_server.h" #include "arrow/flight/test_util.h" // OTel includes must come after any gRPC includes, and // client_header_internal.h includes gRPC. 
See: @@ -247,7 +249,7 @@ TEST(TestFlight, ConnectUriUnix) { // CI environments don't have an IPv6 interface configured TEST(TestFlight, DISABLED_IpV6Port) { - std::unique_ptr server = ExampleTestServer(); + std::unique_ptr server = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForGrpcTcp("[::1]", 0)); FlightServerOptions options(location); @@ -261,7 +263,7 @@ TEST(TestFlight, DISABLED_IpV6Port) { } TEST(TestFlight, ServerCallContextIncomingHeaders) { - auto server = ExampleTestServer(); + auto server = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForGrpcTcp("localhost", 0)); FlightServerOptions options(location); ASSERT_OK(server->Init(options)); @@ -290,7 +292,7 @@ TEST(TestFlight, ServerCallContextIncomingHeaders) { class TestFlightClient : public ::testing::Test { public: void SetUp() { - server_ = ExampleTestServer(); + server_ = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForGrpcTcp("localhost", 0)); FlightServerOptions options(location); diff --git a/cpp/src/arrow/flight/integration_tests/test_integration.cc b/cpp/src/arrow/flight/integration_tests/test_integration.cc index 665c1f1ba03..da6fcf81eb7 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration.cc @@ -36,6 +36,7 @@ #include "arrow/flight/sql/server.h" #include "arrow/flight/sql/server_session_middleware.h" #include "arrow/flight/sql/types.h" +#include "arrow/flight/test_auth_handlers.h" #include "arrow/flight/test_util.h" #include "arrow/flight/types.h" #include "arrow/ipc/dictionary.h" diff --git a/cpp/src/arrow/flight/test_auth_handlers.cc b/cpp/src/arrow/flight/test_auth_handlers.cc new file mode 100644 index 00000000000..856ccf0f2b2 --- /dev/null +++ b/cpp/src/arrow/flight/test_auth_handlers.cc @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/flight/client_auth.h" +#include "arrow/flight/server.h" +#include "arrow/flight/server_auth.h" +#include "arrow/flight/test_auth_handlers.h" +#include "arrow/flight/types.h" +#include "arrow/flight/visibility.h" +#include "arrow/status.h" + +namespace arrow::flight { + +// TestServerAuthHandler + +TestServerAuthHandler::TestServerAuthHandler(const std::string& username, + const std::string& password) + : username_(username), password_(password) {} + +TestServerAuthHandler::~TestServerAuthHandler() {} + +Status TestServerAuthHandler::Authenticate(const ServerCallContext& context, + ServerAuthSender* outgoing, + ServerAuthReader* incoming) { + std::string token; + RETURN_NOT_OK(incoming->Read(&token)); + if (token != password_) { + return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); + } + RETURN_NOT_OK(outgoing->Write(username_)); + return Status::OK(); +} + +Status TestServerAuthHandler::IsValid(const ServerCallContext& context, + const std::string& token, + std::string* peer_identity) { + if (token != password_) { + return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); + } + *peer_identity = username_; + return Status::OK(); +} + +// TestServerBasicAuthHandler + +TestServerBasicAuthHandler::TestServerBasicAuthHandler(const std::string& username, + const std::string& password) { + basic_auth_.username = username; + basic_auth_.password = password; +} + +TestServerBasicAuthHandler::~TestServerBasicAuthHandler() {} + +Status TestServerBasicAuthHandler::Authenticate(const ServerCallContext& context, + ServerAuthSender* outgoing, + ServerAuthReader* incoming) { + std::string token; + RETURN_NOT_OK(incoming->Read(&token)); + ARROW_ASSIGN_OR_RAISE(BasicAuth incoming_auth, BasicAuth::Deserialize(token)); + if (incoming_auth.username != basic_auth_.username || + incoming_auth.password != basic_auth_.password) { + return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); + } + RETURN_NOT_OK(outgoing->Write(basic_auth_.username)); + return Status::OK(); +} + +Status TestServerBasicAuthHandler::IsValid(const ServerCallContext& context, + const std::string& token, + std::string* peer_identity) { + if (token != basic_auth_.username) { + return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); + } + *peer_identity = basic_auth_.username; + return Status::OK(); +} + +// TestClientAuthHandler + +TestClientAuthHandler::TestClientAuthHandler(const std::string& username, + const std::string& password) + : username_(username), password_(password) {} + +TestClientAuthHandler::~TestClientAuthHandler() {} + +Status TestClientAuthHandler::Authenticate(ClientAuthSender* outgoing, + ClientAuthReader* incoming) { + RETURN_NOT_OK(outgoing->Write(password_)); + std::string username; + RETURN_NOT_OK(incoming->Read(&username)); + if (username != username_) { + return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); + } + return Status::OK(); +} + +Status TestClientAuthHandler::GetToken(std::string* token) { + *token = password_; + return Status::OK(); +} + +// TestClientBasicAuthHandler + +TestClientBasicAuthHandler::TestClientBasicAuthHandler(const std::string& username, + const std::string& password) { + basic_auth_.username = username; + basic_auth_.password = password; +} + +TestClientBasicAuthHandler::~TestClientBasicAuthHandler() {} + +Status TestClientBasicAuthHandler::Authenticate(ClientAuthSender* outgoing, + ClientAuthReader* incoming) { + ARROW_ASSIGN_OR_RAISE(std::string 
pb_result, basic_auth_.SerializeToString()); + RETURN_NOT_OK(outgoing->Write(pb_result)); + RETURN_NOT_OK(incoming->Read(&token_)); + return Status::OK(); +} + +Status TestClientBasicAuthHandler::GetToken(std::string* token) { + *token = token_; + return Status::OK(); +} + +} // namespace arrow::flight diff --git a/cpp/src/arrow/flight/test_auth_handlers.h b/cpp/src/arrow/flight/test_auth_handlers.h new file mode 100644 index 00000000000..74f48798f3b --- /dev/null +++ b/cpp/src/arrow/flight/test_auth_handlers.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/flight/client_auth.h" +#include "arrow/flight/server.h" +#include "arrow/flight/server_auth.h" +#include "arrow/flight/types.h" +#include "arrow/flight/visibility.h" +#include "arrow/status.h" + +// A pair of authentication handlers that check for a predefined password +// and set the peer identity to a predefined username. + +namespace arrow::flight { + +class ARROW_FLIGHT_EXPORT TestServerAuthHandler : public ServerAuthHandler { + public: + explicit TestServerAuthHandler(const std::string& username, + const std::string& password); + ~TestServerAuthHandler() override; + Status Authenticate(const ServerCallContext& context, ServerAuthSender* outgoing, + ServerAuthReader* incoming) override; + Status IsValid(const ServerCallContext& context, const std::string& token, + std::string* peer_identity) override; + + private: + std::string username_; + std::string password_; +}; + +class ARROW_FLIGHT_EXPORT TestServerBasicAuthHandler : public ServerAuthHandler { + public: + explicit TestServerBasicAuthHandler(const std::string& username, + const std::string& password); + ~TestServerBasicAuthHandler() override; + Status Authenticate(const ServerCallContext& context, ServerAuthSender* outgoing, + ServerAuthReader* incoming) override; + Status IsValid(const ServerCallContext& context, const std::string& token, + std::string* peer_identity) override; + + private: + BasicAuth basic_auth_; +}; + +class ARROW_FLIGHT_EXPORT TestClientAuthHandler : public ClientAuthHandler { + public: + explicit TestClientAuthHandler(const std::string& username, + const std::string& password); + ~TestClientAuthHandler() override; + Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override; + Status GetToken(std::string* token) override; + + private: + std::string username_; + std::string password_; +}; + +class ARROW_FLIGHT_EXPORT TestClientBasicAuthHandler : public ClientAuthHandler { + public: + explicit TestClientBasicAuthHandler(const std::string& username, + const std::string& password); + ~TestClientBasicAuthHandler() override; + Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override; + Status 
GetToken(std::string* token) override; + + private: + BasicAuth basic_auth_; + std::string token_; +}; + +} // namespace arrow::flight diff --git a/cpp/src/arrow/flight/test_definitions.cc b/cpp/src/arrow/flight/test_definitions.cc index c43b693d84a..273d394c288 100644 --- a/cpp/src/arrow/flight/test_definitions.cc +++ b/cpp/src/arrow/flight/test_definitions.cc @@ -27,6 +27,7 @@ #include "arrow/array/util.h" #include "arrow/flight/api.h" #include "arrow/flight/client_middleware.h" +#include "arrow/flight/test_flight_server.h" #include "arrow/flight/test_util.h" #include "arrow/flight/types.h" #include "arrow/flight/types_async.h" @@ -53,7 +54,7 @@ using arrow::internal::checked_cast; // Tests of initialization/shutdown void ConnectivityTest::TestGetPort() { - std::unique_ptr server = ExampleTestServer(); + std::unique_ptr server = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); FlightServerOptions options(location); @@ -61,7 +62,7 @@ void ConnectivityTest::TestGetPort() { ASSERT_GT(server->port(), 0); } void ConnectivityTest::TestBuilderHook() { - std::unique_ptr server = ExampleTestServer(); + std::unique_ptr server = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); FlightServerOptions options(location); @@ -80,7 +81,7 @@ void ConnectivityTest::TestShutdown() { constexpr int kIterations = 10; ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); for (int i = 0; i < kIterations; i++) { - std::unique_ptr server = ExampleTestServer(); + std::unique_ptr server = TestFlightServer::Make(); FlightServerOptions options(location); ASSERT_OK(server->Init(options)); @@ -92,7 +93,7 @@ void ConnectivityTest::TestShutdown() { } } void ConnectivityTest::TestShutdownWithDeadline() { - std::unique_ptr server = ExampleTestServer(); + std::unique_ptr server = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); FlightServerOptions options(location); @@ -105,7 +106,7 @@ void ConnectivityTest::TestShutdownWithDeadline() { ASSERT_OK(server->Wait()); } void ConnectivityTest::TestBrokenConnection() { - std::unique_ptr server = ExampleTestServer(); + std::unique_ptr server = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); FlightServerOptions options(location); ASSERT_OK(server->Init(options)); @@ -151,7 +152,7 @@ class GetFlightInfoListener : public AsyncListener { } // namespace void DataTest::SetUpTest() { - server_ = ExampleTestServer(); + server_ = TestFlightServer::Make(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); FlightServerOptions options(location); @@ -1822,7 +1823,7 @@ void AsyncClientTest::SetUpTest() { ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); - server_ = ExampleTestServer(); + server_ = TestFlightServer::Make(); FlightServerOptions server_options(location); ASSERT_OK(server_->Init(server_options)); diff --git a/cpp/src/arrow/flight/test_flight_server.cc b/cpp/src/arrow/flight/test_flight_server.cc new file mode 100644 index 00000000000..0ea95ebd15b --- /dev/null +++ b/cpp/src/arrow/flight/test_flight_server.cc @@ -0,0 +1,417 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/flight/test_flight_server.h" + +#include "arrow/array/array_base.h" +#include "arrow/array/array_primitive.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/flight/server.h" +#include "arrow/flight/test_util.h" +#include "arrow/flight/type_fwd.h" +#include "arrow/status.h" + +namespace arrow::flight { +namespace { + +class ErrorRecordBatchReader : public RecordBatchReader { + public: + ErrorRecordBatchReader() : schema_(arrow::schema({})) {} + + std::shared_ptr schema() const override { return schema_; } + + Status ReadNext(std::shared_ptr* out) override { + *out = nullptr; + return Status::OK(); + } + + Status Close() override { + // This should be propagated over DoGet to the client + return Status::IOError("Expected error"); + } + + private: + std::shared_ptr schema_; +}; + +Status GetBatchForFlight(const Ticket& ticket, std::shared_ptr* out) { + if (ticket.ticket == "ticket-ints-1") { + RecordBatchVector batches; + RETURN_NOT_OK(ExampleIntBatches(&batches)); + ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); + return Status::OK(); + } else if (ticket.ticket == "ticket-floats-1") { + RecordBatchVector batches; + RETURN_NOT_OK(ExampleFloatBatches(&batches)); + ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); + return Status::OK(); + } else if (ticket.ticket == "ticket-dicts-1") { + RecordBatchVector batches; + RETURN_NOT_OK(ExampleDictBatches(&batches)); + ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); + return Status::OK(); + } else if (ticket.ticket == "ticket-large-batch-1") { + RecordBatchVector batches; + RETURN_NOT_OK(ExampleLargeBatches(&batches)); + ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); + return Status::OK(); + } else { + return Status::NotImplemented("no stream implemented for ticket: " + ticket.ticket); + } +} + +} // namespace + +std::unique_ptr TestFlightServer::Make() { + return std::make_unique(); +} + +Status TestFlightServer::ListFlights(const ServerCallContext& context, + const Criteria* criteria, + std::unique_ptr* listings) { + std::vector flights = ExampleFlightInfo(); + if (criteria && criteria->expression != "") { + // For test purposes, if we get criteria, return no results + flights.clear(); + } + *listings = std::make_unique(flights); + return Status::OK(); +} + +Status TestFlightServer::GetFlightInfo(const ServerCallContext& context, + const FlightDescriptor& request, + std::unique_ptr* out) { + // Test that Arrow-C++ status codes make it through the transport + if (request.type == FlightDescriptor::DescriptorType::CMD && + request.cmd == "status-outofmemory") { + return Status::OutOfMemory("Sentinel"); + } + + std::vector flights = ExampleFlightInfo(); + + for (const auto& info : flights) { + if (info.descriptor().Equals(request)) { + *out = 
std::make_unique(info); + return Status::OK(); + } + } + return Status::Invalid("Flight not found: ", request.ToString()); +} + +Status TestFlightServer::DoGet(const ServerCallContext& context, const Ticket& request, + std::unique_ptr* data_stream) { + // Test for ARROW-5095 + if (request.ticket == "ARROW-5095-fail") { + return Status::UnknownError("Server-side error"); + } + if (request.ticket == "ARROW-5095-success") { + return Status::OK(); + } + if (request.ticket == "ARROW-13253-DoGet-Batch") { + // Make batch > 2GiB in size + ARROW_ASSIGN_OR_RAISE(auto batch, VeryLargeBatch()); + ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchReader::Make({batch})); + *data_stream = std::make_unique(std::move(reader)); + return Status::OK(); + } + if (request.ticket == "ticket-stream-error") { + auto reader = std::make_shared(); + *data_stream = std::make_unique(std::move(reader)); + return Status::OK(); + } + + std::shared_ptr batch_reader; + RETURN_NOT_OK(GetBatchForFlight(request, &batch_reader)); + + *data_stream = std::make_unique(batch_reader); + return Status::OK(); +} + +Status TestFlightServer::DoPut(const ServerCallContext&, + std::unique_ptr reader, + std::unique_ptr writer) { + return reader->ToRecordBatches().status(); +} + +Status TestFlightServer::DoExchange(const ServerCallContext& context, + std::unique_ptr reader, + std::unique_ptr writer) { + // Test various scenarios for a DoExchange + if (reader->descriptor().type != FlightDescriptor::DescriptorType::CMD) { + return Status::Invalid("Must provide a command descriptor"); + } + + const std::string& cmd = reader->descriptor().cmd; + if (cmd == "error") { + // Immediately return an error to the client. + return Status::NotImplemented("Expected error"); + } else if (cmd == "get") { + return RunExchangeGet(std::move(reader), std::move(writer)); + } else if (cmd == "put") { + return RunExchangePut(std::move(reader), std::move(writer)); + } else if (cmd == "counter") { + return RunExchangeCounter(std::move(reader), std::move(writer)); + } else if (cmd == "total") { + return RunExchangeTotal(std::move(reader), std::move(writer)); + } else if (cmd == "echo") { + return RunExchangeEcho(std::move(reader), std::move(writer)); + } else if (cmd == "large_batch") { + return RunExchangeLargeBatch(std::move(reader), std::move(writer)); + } else if (cmd == "TestUndrained") { + ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); + return Status::OK(); + } else { + return Status::NotImplemented("Scenario not implemented: ", cmd); + } +} + +// A simple example - act like DoGet. 
+Status TestFlightServer::RunExchangeGet(std::unique_ptr reader, + std::unique_ptr writer) { + RETURN_NOT_OK(writer->Begin(ExampleIntSchema())); + RecordBatchVector batches; + RETURN_NOT_OK(ExampleIntBatches(&batches)); + for (const auto& batch : batches) { + RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); + } + return Status::OK(); +} + +// A simple example - act like DoPut +Status TestFlightServer::RunExchangePut(std::unique_ptr reader, + std::unique_ptr writer) { + ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); + if (!schema->Equals(ExampleIntSchema(), false)) { + return Status::Invalid("Schema is not as expected"); + } + RecordBatchVector batches; + RETURN_NOT_OK(ExampleIntBatches(&batches)); + FlightStreamChunk chunk; + for (const auto& batch : batches) { + ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); + if (!chunk.data) { + return Status::Invalid("Expected another batch"); + } + if (!batch->Equals(*chunk.data)) { + return Status::Invalid("Batch does not match"); + } + } + ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); + if (chunk.data || chunk.app_metadata) { + return Status::Invalid("Too many batches"); + } + + RETURN_NOT_OK(writer->WriteMetadata(Buffer::FromString("done"))); + return Status::OK(); +} + +// Read some number of record batches from the client, send a +// metadata message back with the count, then echo the batches back. +Status TestFlightServer::RunExchangeCounter(std::unique_ptr reader, + std::unique_ptr writer) { + std::vector> batches; + FlightStreamChunk chunk; + int chunks = 0; + while (true) { + ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); + if (!chunk.data && !chunk.app_metadata) { + break; + } + if (chunk.data) { + batches.push_back(chunk.data); + chunks++; + } + } + + // Echo back the number of record batches read. + std::shared_ptr buf = Buffer::FromString(std::to_string(chunks)); + RETURN_NOT_OK(writer->WriteMetadata(buf)); + // Echo the record batches themselves. + if (chunks > 0) { + ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); + RETURN_NOT_OK(writer->Begin(schema)); + + for (const auto& batch : batches) { + RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); + } + } + + return Status::OK(); +} + +// Read int64 batches from the client, each time sending back a +// batch with a running sum of columns. 
+Status TestFlightServer::RunExchangeTotal(std::unique_ptr reader, + std::unique_ptr writer) { + FlightStreamChunk chunk{}; + ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); + // Ensure the schema contains only int64 columns + for (const auto& field : schema->fields()) { + if (field->type()->id() != Type::type::INT64) { + return Status::Invalid("Field is not INT64: ", field->name()); + } + } + std::vector sums(schema->num_fields()); + std::vector> columns(schema->num_fields()); + RETURN_NOT_OK(writer->Begin(schema)); + while (true) { + ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); + if (!chunk.data && !chunk.app_metadata) { + break; + } + if (chunk.data) { + if (!chunk.data->schema()->Equals(schema, false)) { + // A compliant client implementation would make this impossible + return Status::Invalid("Schemas are incompatible"); + } + + // Update the running totals + auto builder = std::make_shared(); + int col_index = 0; + for (const auto& column : chunk.data->columns()) { + auto arr = std::dynamic_pointer_cast(column); + if (!arr) { + return MakeFlightError(FlightStatusCode::Internal, "Could not cast array"); + } + for (int row = 0; row < column->length(); row++) { + if (!arr->IsNull(row)) { + sums[col_index] += arr->Value(row); + } + } + + builder->Reset(); + RETURN_NOT_OK(builder->Append(sums[col_index])); + RETURN_NOT_OK(builder->Finish(&columns[col_index])); + + col_index++; + } + + // Echo the totals to the client + auto response = RecordBatch::Make(schema, /* num_rows */ 1, columns); + RETURN_NOT_OK(writer->WriteRecordBatch(*response)); + } + } + return Status::OK(); +} + +// Echo the client's messages back. +Status TestFlightServer::RunExchangeEcho(std::unique_ptr reader, + std::unique_ptr writer) { + FlightStreamChunk chunk; + bool begun = false; + while (true) { + ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); + if (!chunk.data && !chunk.app_metadata) { + break; + } + if (!begun && chunk.data) { + begun = true; + RETURN_NOT_OK(writer->Begin(chunk.data->schema())); + } + if (chunk.data && chunk.app_metadata) { + RETURN_NOT_OK(writer->WriteWithMetadata(*chunk.data, chunk.app_metadata)); + } else if (chunk.data) { + RETURN_NOT_OK(writer->WriteRecordBatch(*chunk.data)); + } else if (chunk.app_metadata) { + RETURN_NOT_OK(writer->WriteMetadata(chunk.app_metadata)); + } + } + return Status::OK(); +} + +// Regression test for ARROW-13253 +Status TestFlightServer::RunExchangeLargeBatch( + std::unique_ptr, std::unique_ptr writer) { + ARROW_ASSIGN_OR_RAISE(auto batch, VeryLargeBatch()); + RETURN_NOT_OK(writer->Begin(batch->schema())); + return writer->WriteRecordBatch(*batch); +} + +Status TestFlightServer::RunAction1(const Action& action, + std::unique_ptr* out) { + std::vector results; + for (int i = 0; i < 3; ++i) { + Result result; + std::string value = action.body->ToString() + "-part" + std::to_string(i); + result.body = Buffer::FromString(std::move(value)); + results.push_back(result); + } + *out = std::make_unique(std::move(results)); + return Status::OK(); +} + +Status TestFlightServer::RunAction2(std::unique_ptr* out) { + // Empty + *out = std::make_unique(std::vector{}); + return Status::OK(); +} + +Status TestFlightServer::ListIncomingHeaders(const ServerCallContext& context, + const Action& action, + std::unique_ptr* out) { + std::vector results; + std::string_view prefix(*action.body); + for (const auto& header : context.incoming_headers()) { + if (header.first.substr(0, prefix.size()) != prefix) { + continue; + } + Result result; + result.body = + 
Buffer::FromString(std::string(header.first) + ": " + std::string(header.second)); + results.push_back(result); + } + *out = std::make_unique(std::move(results)); + return Status::OK(); +} + +Status TestFlightServer::DoAction(const ServerCallContext& context, const Action& action, + std::unique_ptr* out) { + if (action.type == "action1") { + return RunAction1(action, out); + } else if (action.type == "action2") { + return RunAction2(out); + } else if (action.type == "list-incoming-headers") { + return ListIncomingHeaders(context, action, out); + } else { + return Status::NotImplemented(action.type); + } +} + +Status TestFlightServer::ListActions(const ServerCallContext& context, + std::vector* out) { + std::vector actions = ExampleActionTypes(); + *out = std::move(actions); + return Status::OK(); +} + +Status TestFlightServer::GetSchema(const ServerCallContext& context, + const FlightDescriptor& request, + std::unique_ptr* schema) { + std::vector flights = ExampleFlightInfo(); + + for (const auto& info : flights) { + if (info.descriptor().Equals(request)) { + *schema = std::make_unique(info.serialized_schema()); + return Status::OK(); + } + } + return Status::Invalid("Flight not found: ", request.ToString()); +} + +} // namespace arrow::flight diff --git a/cpp/src/arrow/flight/test_flight_server.h b/cpp/src/arrow/flight/test_flight_server.h new file mode 100644 index 00000000000..794dd834c01 --- /dev/null +++ b/cpp/src/arrow/flight/test_flight_server.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/flight/server.h" +#include "arrow/flight/type_fwd.h" +#include "arrow/flight/visibility.h" +#include "arrow/status.h" + +namespace arrow::flight { + +class ARROW_FLIGHT_EXPORT TestFlightServer : public FlightServerBase { + public: + static std::unique_ptr Make(); + + Status ListFlights(const ServerCallContext& context, const Criteria* criteria, + std::unique_ptr* listings) override; + + Status GetFlightInfo(const ServerCallContext& context, const FlightDescriptor& request, + std::unique_ptr* out) override; + + Status DoGet(const ServerCallContext& context, const Ticket& request, + std::unique_ptr* data_stream) override; + + Status DoPut(const ServerCallContext&, std::unique_ptr reader, + std::unique_ptr writer) override; + + Status DoExchange(const ServerCallContext& context, + std::unique_ptr reader, + std::unique_ptr writer) override; + + // A simple example - act like DoGet. 
+ Status RunExchangeGet(std::unique_ptr reader, + std::unique_ptr writer); + + // A simple example - act like DoPut + Status RunExchangePut(std::unique_ptr reader, + std::unique_ptr writer); + + // Read some number of record batches from the client, send a + // metadata message back with the count, then echo the batches back. + Status RunExchangeCounter(std::unique_ptr reader, + std::unique_ptr writer); + + // Read int64 batches from the client, each time sending back a + // batch with a running sum of columns. + Status RunExchangeTotal(std::unique_ptr reader, + std::unique_ptr writer); + + // Echo the client's messages back. + Status RunExchangeEcho(std::unique_ptr reader, + std::unique_ptr writer); + + // Regression test for ARROW-13253 + Status RunExchangeLargeBatch(std::unique_ptr, + std::unique_ptr writer); + + Status RunAction1(const Action& action, std::unique_ptr* out); + + Status RunAction2(std::unique_ptr* out); + + Status ListIncomingHeaders(const ServerCallContext& context, const Action& action, + std::unique_ptr* out); + + Status DoAction(const ServerCallContext& context, const Action& action, + std::unique_ptr* out) override; + + Status ListActions(const ServerCallContext& context, + std::vector* out) override; + + Status GetSchema(const ServerCallContext& context, const FlightDescriptor& request, + std::unique_ptr* schema) override; +}; + +} // namespace arrow::flight diff --git a/cpp/src/arrow/flight/test_server.cc b/cpp/src/arrow/flight/test_server.cc index 18bf2b41359..ba84b8f532e 100644 --- a/cpp/src/arrow/flight/test_server.cc +++ b/cpp/src/arrow/flight/test_server.cc @@ -26,6 +26,7 @@ #include #include "arrow/flight/server.h" +#include "arrow/flight/test_flight_server.h" #include "arrow/flight/test_util.h" #include "arrow/flight/types.h" #include "arrow/util/logging.h" @@ -38,7 +39,7 @@ std::unique_ptr g_server; int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - g_server = arrow::flight::ExampleTestServer(); + g_server = arrow::flight::TestFlightServer::Make(); arrow::flight::Location location; if (FLAGS_unix.empty()) { diff --git a/cpp/src/arrow/flight/test_util.cc b/cpp/src/arrow/flight/test_util.cc index 8b4245e74e8..127827ff38c 100644 --- a/cpp/src/arrow/flight/test_util.cc +++ b/cpp/src/arrow/flight/test_util.cc @@ -49,8 +49,7 @@ #include "arrow/flight/api.h" #include "arrow/flight/serialization_internal.h" -namespace arrow { -namespace flight { +namespace arrow::flight { namespace bp = boost::process; namespace fs = boost::filesystem; @@ -90,25 +89,6 @@ Status ResolveCurrentExecutable(fs::path* out) { } } -class ErrorRecordBatchReader : public RecordBatchReader { - public: - ErrorRecordBatchReader() : schema_(arrow::schema({})) {} - - std::shared_ptr schema() const override { return schema_; } - - Status ReadNext(std::shared_ptr* out) override { - *out = nullptr; - return Status::OK(); - } - - Status Close() override { - // This should be propagated over DoGet to the client - return Status::IOError("Expected error"); - } - - private: - std::shared_ptr schema_; -}; } // namespace void TestServer::Start(const std::vector& extra_args) { @@ -171,364 +151,6 @@ int TestServer::port() const { return port_; } const std::string& TestServer::unix_sock() const { return unix_sock_; } -Status GetBatchForFlight(const Ticket& ticket, std::shared_ptr* out) { - if (ticket.ticket == "ticket-ints-1") { - RecordBatchVector batches; - RETURN_NOT_OK(ExampleIntBatches(&batches)); - ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); - 
return Status::OK(); - } else if (ticket.ticket == "ticket-floats-1") { - RecordBatchVector batches; - RETURN_NOT_OK(ExampleFloatBatches(&batches)); - ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); - return Status::OK(); - } else if (ticket.ticket == "ticket-dicts-1") { - RecordBatchVector batches; - RETURN_NOT_OK(ExampleDictBatches(&batches)); - ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); - return Status::OK(); - } else if (ticket.ticket == "ticket-large-batch-1") { - RecordBatchVector batches; - RETURN_NOT_OK(ExampleLargeBatches(&batches)); - ARROW_ASSIGN_OR_RAISE(*out, RecordBatchReader::Make(batches)); - return Status::OK(); - } else { - return Status::NotImplemented("no stream implemented for ticket: " + ticket.ticket); - } -} - -class FlightTestServer : public FlightServerBase { - Status ListFlights(const ServerCallContext& context, const Criteria* criteria, - std::unique_ptr* listings) override { - std::vector flights = ExampleFlightInfo(); - if (criteria && criteria->expression != "") { - // For test purposes, if we get criteria, return no results - flights.clear(); - } - *listings = std::make_unique(flights); - return Status::OK(); - } - - Status GetFlightInfo(const ServerCallContext& context, const FlightDescriptor& request, - std::unique_ptr* out) override { - // Test that Arrow-C++ status codes make it through the transport - if (request.type == FlightDescriptor::DescriptorType::CMD && - request.cmd == "status-outofmemory") { - return Status::OutOfMemory("Sentinel"); - } - - std::vector flights = ExampleFlightInfo(); - - for (const auto& info : flights) { - if (info.descriptor().Equals(request)) { - *out = std::make_unique(info); - return Status::OK(); - } - } - return Status::Invalid("Flight not found: ", request.ToString()); - } - - Status DoGet(const ServerCallContext& context, const Ticket& request, - std::unique_ptr* data_stream) override { - // Test for ARROW-5095 - if (request.ticket == "ARROW-5095-fail") { - return Status::UnknownError("Server-side error"); - } - if (request.ticket == "ARROW-5095-success") { - return Status::OK(); - } - if (request.ticket == "ARROW-13253-DoGet-Batch") { - // Make batch > 2GiB in size - ARROW_ASSIGN_OR_RAISE(auto batch, VeryLargeBatch()); - ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchReader::Make({batch})); - *data_stream = std::make_unique(std::move(reader)); - return Status::OK(); - } - if (request.ticket == "ticket-stream-error") { - auto reader = std::make_shared(); - *data_stream = std::make_unique(std::move(reader)); - return Status::OK(); - } - - std::shared_ptr batch_reader; - RETURN_NOT_OK(GetBatchForFlight(request, &batch_reader)); - - *data_stream = std::make_unique(batch_reader); - return Status::OK(); - } - - Status DoPut(const ServerCallContext&, std::unique_ptr reader, - std::unique_ptr writer) override { - return reader->ToRecordBatches().status(); - } - - Status DoExchange(const ServerCallContext& context, - std::unique_ptr reader, - std::unique_ptr writer) override { - // Test various scenarios for a DoExchange - if (reader->descriptor().type != FlightDescriptor::DescriptorType::CMD) { - return Status::Invalid("Must provide a command descriptor"); - } - - const std::string& cmd = reader->descriptor().cmd; - if (cmd == "error") { - // Immediately return an error to the client. 
- return Status::NotImplemented("Expected error"); - } else if (cmd == "get") { - return RunExchangeGet(std::move(reader), std::move(writer)); - } else if (cmd == "put") { - return RunExchangePut(std::move(reader), std::move(writer)); - } else if (cmd == "counter") { - return RunExchangeCounter(std::move(reader), std::move(writer)); - } else if (cmd == "total") { - return RunExchangeTotal(std::move(reader), std::move(writer)); - } else if (cmd == "echo") { - return RunExchangeEcho(std::move(reader), std::move(writer)); - } else if (cmd == "large_batch") { - return RunExchangeLargeBatch(std::move(reader), std::move(writer)); - } else if (cmd == "TestUndrained") { - ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); - return Status::OK(); - } else { - return Status::NotImplemented("Scenario not implemented: ", cmd); - } - } - - // A simple example - act like DoGet. - Status RunExchangeGet(std::unique_ptr reader, - std::unique_ptr writer) { - RETURN_NOT_OK(writer->Begin(ExampleIntSchema())); - RecordBatchVector batches; - RETURN_NOT_OK(ExampleIntBatches(&batches)); - for (const auto& batch : batches) { - RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); - } - return Status::OK(); - } - - // A simple example - act like DoPut - Status RunExchangePut(std::unique_ptr reader, - std::unique_ptr writer) { - ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); - if (!schema->Equals(ExampleIntSchema(), false)) { - return Status::Invalid("Schema is not as expected"); - } - RecordBatchVector batches; - RETURN_NOT_OK(ExampleIntBatches(&batches)); - FlightStreamChunk chunk; - for (const auto& batch : batches) { - ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); - if (!chunk.data) { - return Status::Invalid("Expected another batch"); - } - if (!batch->Equals(*chunk.data)) { - return Status::Invalid("Batch does not match"); - } - } - ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); - if (chunk.data || chunk.app_metadata) { - return Status::Invalid("Too many batches"); - } - - RETURN_NOT_OK(writer->WriteMetadata(Buffer::FromString("done"))); - return Status::OK(); - } - - // Read some number of record batches from the client, send a - // metadata message back with the count, then echo the batches back. - Status RunExchangeCounter(std::unique_ptr reader, - std::unique_ptr writer) { - std::vector> batches; - FlightStreamChunk chunk; - int chunks = 0; - while (true) { - ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); - if (!chunk.data && !chunk.app_metadata) { - break; - } - if (chunk.data) { - batches.push_back(chunk.data); - chunks++; - } - } - - // Echo back the number of record batches read. - std::shared_ptr buf = Buffer::FromString(std::to_string(chunks)); - RETURN_NOT_OK(writer->WriteMetadata(buf)); - // Echo the record batches themselves. - if (chunks > 0) { - ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); - RETURN_NOT_OK(writer->Begin(schema)); - - for (const auto& batch : batches) { - RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); - } - } - - return Status::OK(); - } - - // Read int64 batches from the client, each time sending back a - // batch with a running sum of columns. 
- Status RunExchangeTotal(std::unique_ptr reader, - std::unique_ptr writer) { - FlightStreamChunk chunk{}; - ARROW_ASSIGN_OR_RAISE(auto schema, reader->GetSchema()); - // Ensure the schema contains only int64 columns - for (const auto& field : schema->fields()) { - if (field->type()->id() != Type::type::INT64) { - return Status::Invalid("Field is not INT64: ", field->name()); - } - } - std::vector sums(schema->num_fields()); - std::vector> columns(schema->num_fields()); - RETURN_NOT_OK(writer->Begin(schema)); - while (true) { - ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); - if (!chunk.data && !chunk.app_metadata) { - break; - } - if (chunk.data) { - if (!chunk.data->schema()->Equals(schema, false)) { - // A compliant client implementation would make this impossible - return Status::Invalid("Schemas are incompatible"); - } - - // Update the running totals - auto builder = std::make_shared(); - int col_index = 0; - for (const auto& column : chunk.data->columns()) { - auto arr = std::dynamic_pointer_cast(column); - if (!arr) { - return MakeFlightError(FlightStatusCode::Internal, "Could not cast array"); - } - for (int row = 0; row < column->length(); row++) { - if (!arr->IsNull(row)) { - sums[col_index] += arr->Value(row); - } - } - - builder->Reset(); - RETURN_NOT_OK(builder->Append(sums[col_index])); - RETURN_NOT_OK(builder->Finish(&columns[col_index])); - - col_index++; - } - - // Echo the totals to the client - auto response = RecordBatch::Make(schema, /* num_rows */ 1, columns); - RETURN_NOT_OK(writer->WriteRecordBatch(*response)); - } - } - return Status::OK(); - } - - // Echo the client's messages back. - Status RunExchangeEcho(std::unique_ptr reader, - std::unique_ptr writer) { - FlightStreamChunk chunk; - bool begun = false; - while (true) { - ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); - if (!chunk.data && !chunk.app_metadata) { - break; - } - if (!begun && chunk.data) { - begun = true; - RETURN_NOT_OK(writer->Begin(chunk.data->schema())); - } - if (chunk.data && chunk.app_metadata) { - RETURN_NOT_OK(writer->WriteWithMetadata(*chunk.data, chunk.app_metadata)); - } else if (chunk.data) { - RETURN_NOT_OK(writer->WriteRecordBatch(*chunk.data)); - } else if (chunk.app_metadata) { - RETURN_NOT_OK(writer->WriteMetadata(chunk.app_metadata)); - } - } - return Status::OK(); - } - - // Regression test for ARROW-13253 - Status RunExchangeLargeBatch(std::unique_ptr, - std::unique_ptr writer) { - ARROW_ASSIGN_OR_RAISE(auto batch, VeryLargeBatch()); - RETURN_NOT_OK(writer->Begin(batch->schema())); - return writer->WriteRecordBatch(*batch); - } - - Status RunAction1(const Action& action, std::unique_ptr* out) { - std::vector results; - for (int i = 0; i < 3; ++i) { - Result result; - std::string value = action.body->ToString() + "-part" + std::to_string(i); - result.body = Buffer::FromString(std::move(value)); - results.push_back(result); - } - *out = std::make_unique(std::move(results)); - return Status::OK(); - } - - Status RunAction2(std::unique_ptr* out) { - // Empty - *out = std::make_unique(std::vector{}); - return Status::OK(); - } - - Status ListIncomingHeaders(const ServerCallContext& context, const Action& action, - std::unique_ptr* out) { - std::vector results; - std::string_view prefix(*action.body); - for (const auto& header : context.incoming_headers()) { - if (header.first.substr(0, prefix.size()) != prefix) { - continue; - } - Result result; - result.body = Buffer::FromString(std::string(header.first) + ": " + - std::string(header.second)); - results.push_back(result); - } - 
*out = std::make_unique(std::move(results)); - return Status::OK(); - } - - Status DoAction(const ServerCallContext& context, const Action& action, - std::unique_ptr* out) override { - if (action.type == "action1") { - return RunAction1(action, out); - } else if (action.type == "action2") { - return RunAction2(out); - } else if (action.type == "list-incoming-headers") { - return ListIncomingHeaders(context, action, out); - } else { - return Status::NotImplemented(action.type); - } - } - - Status ListActions(const ServerCallContext& context, - std::vector* out) override { - std::vector actions = ExampleActionTypes(); - *out = std::move(actions); - return Status::OK(); - } - - Status GetSchema(const ServerCallContext& context, const FlightDescriptor& request, - std::unique_ptr* schema) override { - std::vector flights = ExampleFlightInfo(); - - for (const auto& info : flights) { - if (info.descriptor().Equals(request)) { - *schema = std::make_unique(info.serialized_schema()); - return Status::OK(); - } - } - return Status::Invalid("Flight not found: ", request.ToString()); - } -}; - -std::unique_ptr ExampleTestServer() { - return std::make_unique(); -} - FlightInfo MakeFlightInfo(const Schema& schema, const FlightDescriptor& descriptor, const std::vector& endpoints, int64_t total_records, int64_t total_bytes, bool ordered, @@ -701,109 +323,6 @@ std::vector ExampleActionTypes() { return {{"drop", "drop a dataset"}, {"cache", "cache a dataset"}}; } -TestServerAuthHandler::TestServerAuthHandler(const std::string& username, - const std::string& password) - : username_(username), password_(password) {} - -TestServerAuthHandler::~TestServerAuthHandler() {} - -Status TestServerAuthHandler::Authenticate(const ServerCallContext& context, - ServerAuthSender* outgoing, - ServerAuthReader* incoming) { - std::string token; - RETURN_NOT_OK(incoming->Read(&token)); - if (token != password_) { - return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); - } - RETURN_NOT_OK(outgoing->Write(username_)); - return Status::OK(); -} - -Status TestServerAuthHandler::IsValid(const ServerCallContext& context, - const std::string& token, - std::string* peer_identity) { - if (token != password_) { - return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); - } - *peer_identity = username_; - return Status::OK(); -} - -TestServerBasicAuthHandler::TestServerBasicAuthHandler(const std::string& username, - const std::string& password) { - basic_auth_.username = username; - basic_auth_.password = password; -} - -TestServerBasicAuthHandler::~TestServerBasicAuthHandler() {} - -Status TestServerBasicAuthHandler::Authenticate(const ServerCallContext& context, - ServerAuthSender* outgoing, - ServerAuthReader* incoming) { - std::string token; - RETURN_NOT_OK(incoming->Read(&token)); - ARROW_ASSIGN_OR_RAISE(BasicAuth incoming_auth, BasicAuth::Deserialize(token)); - if (incoming_auth.username != basic_auth_.username || - incoming_auth.password != basic_auth_.password) { - return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); - } - RETURN_NOT_OK(outgoing->Write(basic_auth_.username)); - return Status::OK(); -} - -Status TestServerBasicAuthHandler::IsValid(const ServerCallContext& context, - const std::string& token, - std::string* peer_identity) { - if (token != basic_auth_.username) { - return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); - } - *peer_identity = basic_auth_.username; - return Status::OK(); -} - 
-TestClientAuthHandler::TestClientAuthHandler(const std::string& username, - const std::string& password) - : username_(username), password_(password) {} - -TestClientAuthHandler::~TestClientAuthHandler() {} - -Status TestClientAuthHandler::Authenticate(ClientAuthSender* outgoing, - ClientAuthReader* incoming) { - RETURN_NOT_OK(outgoing->Write(password_)); - std::string username; - RETURN_NOT_OK(incoming->Read(&username)); - if (username != username_) { - return MakeFlightError(FlightStatusCode::Unauthenticated, "Invalid token"); - } - return Status::OK(); -} - -Status TestClientAuthHandler::GetToken(std::string* token) { - *token = password_; - return Status::OK(); -} - -TestClientBasicAuthHandler::TestClientBasicAuthHandler(const std::string& username, - const std::string& password) { - basic_auth_.username = username; - basic_auth_.password = password; -} - -TestClientBasicAuthHandler::~TestClientBasicAuthHandler() {} - -Status TestClientBasicAuthHandler::Authenticate(ClientAuthSender* outgoing, - ClientAuthReader* incoming) { - ARROW_ASSIGN_OR_RAISE(std::string pb_result, basic_auth_.SerializeToString()); - RETURN_NOT_OK(outgoing->Write(pb_result)); - RETURN_NOT_OK(incoming->Read(&token_)); - return Status::OK(); -} - -Status TestClientBasicAuthHandler::GetToken(std::string* token) { - *token = token_; - return Status::OK(); -} - Status ExampleTlsCertificates(std::vector* out) { std::string root; RETURN_NOT_OK(GetTestResourceRoot(&root)); @@ -860,5 +379,4 @@ Status ExampleTlsCertificateRoot(CertKeyPair* out) { } } -} // namespace flight -} // namespace arrow +} // namespace arrow::flight diff --git a/cpp/src/arrow/flight/test_util.h b/cpp/src/arrow/flight/test_util.h index c0b42d9b90c..15ba6145ecd 100644 --- a/cpp/src/arrow/flight/test_util.h +++ b/cpp/src/arrow/flight/test_util.h @@ -32,9 +32,7 @@ #include "arrow/testing/util.h" #include "arrow/flight/client.h" -#include "arrow/flight/client_auth.h" #include "arrow/flight/server.h" -#include "arrow/flight/server_auth.h" #include "arrow/flight/types.h" #include "arrow/flight/visibility.h" @@ -95,10 +93,6 @@ class ARROW_FLIGHT_EXPORT TestServer { std::shared_ptr<::boost::process::child> server_process_; }; -/// \brief Create a simple Flight server for testing -ARROW_FLIGHT_EXPORT -std::unique_ptr ExampleTestServer(); - // Helper to initialize a server and matching client with callbacks to // populate options. template @@ -195,65 +189,6 @@ FlightInfo MakeFlightInfo(const Schema& schema, const FlightDescriptor& descript int64_t total_records, int64_t total_bytes, bool ordered, std::string app_metadata); -// ---------------------------------------------------------------------- -// A pair of authentication handlers that check for a predefined password -// and set the peer identity to a predefined username. 
- -class ARROW_FLIGHT_EXPORT TestServerAuthHandler : public ServerAuthHandler { - public: - explicit TestServerAuthHandler(const std::string& username, - const std::string& password); - ~TestServerAuthHandler() override; - Status Authenticate(const ServerCallContext& context, ServerAuthSender* outgoing, - ServerAuthReader* incoming) override; - Status IsValid(const ServerCallContext& context, const std::string& token, - std::string* peer_identity) override; - - private: - std::string username_; - std::string password_; -}; - -class ARROW_FLIGHT_EXPORT TestServerBasicAuthHandler : public ServerAuthHandler { - public: - explicit TestServerBasicAuthHandler(const std::string& username, - const std::string& password); - ~TestServerBasicAuthHandler() override; - Status Authenticate(const ServerCallContext& context, ServerAuthSender* outgoing, - ServerAuthReader* incoming) override; - Status IsValid(const ServerCallContext& context, const std::string& token, - std::string* peer_identity) override; - - private: - BasicAuth basic_auth_; -}; - -class ARROW_FLIGHT_EXPORT TestClientAuthHandler : public ClientAuthHandler { - public: - explicit TestClientAuthHandler(const std::string& username, - const std::string& password); - ~TestClientAuthHandler() override; - Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override; - Status GetToken(std::string* token) override; - - private: - std::string username_; - std::string password_; -}; - -class ARROW_FLIGHT_EXPORT TestClientBasicAuthHandler : public ClientAuthHandler { - public: - explicit TestClientBasicAuthHandler(const std::string& username, - const std::string& password); - ~TestClientBasicAuthHandler() override; - Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override; - Status GetToken(std::string* token) override; - - private: - BasicAuth basic_auth_; - std::string token_; -}; - ARROW_FLIGHT_EXPORT Status ExampleTlsCertificates(std::vector* out); From 4d200dc17daf268863df6f0d7c458cb460904a7c Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 14 Aug 2024 06:31:47 +0530 Subject: [PATCH 008/157] GH-43577: [Java] getBuffers method needs correction on clear flag usage (#43583) ### Rationale for this change `getBuffers` method provides the capability to clear the buffers in the vector, this has not been properly tested while clear flag is not properly used in the implementation across various types of vectors. ### What changes are included in this PR? Updating the vector `getBuffers` method to use `clear` flag as expected and adding corresponding test cases. ### Are these changes tested? Yes, via existing test cases and new test cases. ### Are there any user-facing changes? 
Yes * GitHub Issue: #43577 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- .../arrow/vector/complex/AbstractStructVector.java | 11 +++++++++++ .../vector/complex/BaseRepeatedValueVector.java | 11 +++++++++++ .../arrow/vector/complex/FixedSizeListVector.java | 11 +++++++++++ .../arrow/vector/complex/LargeListVector.java | 9 +++++---- .../arrow/vector/complex/LargeListViewVector.java | 5 +++-- .../org/apache/arrow/vector/complex/ListVector.java | 9 +++++---- .../apache/arrow/vector/complex/ListViewVector.java | 3 ++- .../apache/arrow/vector/complex/StructVector.java | 9 +++++---- .../org/apache/arrow/vector/TestVectorReset.java | 13 ++++++++++++- 9 files changed, 65 insertions(+), 16 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java index feb7edfec94..2921e43cb64 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractStructVector.java @@ -382,6 +382,17 @@ public VectorWithOrdinal getChildVectorWithOrdinal(String name) { return new VectorWithOrdinal(vector, ordinal); } + /** + * Return the underlying buffers associated with this vector. Note that this doesn't impact the + * reference counts for this buffer, so it only should be used for in-context access. Also note + * that this buffer changes regularly, thus external classes shouldn't hold a reference to it + * (unless they change it). + * + * @param clear Whether to clear vector before returning, the buffers will still be refcounted but + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. + * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. + */ @Override public ArrowBuf[] getBuffers(boolean clear) { final List buffers = new ArrayList<>(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 1cdb87eba03..fbe83bad52c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -271,6 +271,17 @@ public void reset() { valueCount = 0; } + /** + * Return the underlying buffers associated with this vector. Note that this doesn't impact the + * reference counts for this buffer, so it only should be used for in-context access. Also note + * that this buffer changes regularly, thus external classes shouldn't hold a reference to it + * (unless they change it). + * + * @param clear Whether to clear vector before returning, the buffers will still be refcounted but + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. + * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. 
+ */ @Override public ArrowBuf[] getBuffers(boolean clear) { final ArrowBuf[] buffers; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index cb455084808..c762eb51725 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -360,6 +360,17 @@ public void reset() { valueCount = 0; } + /** + * Return the underlying buffers associated with this vector. Note that this doesn't impact the + * reference counts for this buffer, so it only should be used for in-context access. Also note + * that this buffer changes regularly, thus external classes shouldn't hold a reference to it + * (unless they change it). + * + * @param clear Whether to clear vector before returning, the buffers will still be refcounted but + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. + * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. + */ @Override public ArrowBuf[] getBuffers(boolean clear) { setReaderAndWriterIndex(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java index b5b32c8032d..ed075352c93 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -882,12 +882,13 @@ public void reset() { /** * Return the underlying buffers associated with this vector. Note that this doesn't impact the - * reference counts for this buffer so it only should be used for in-context access. Also note - * that this buffer changes regularly thus external classes shouldn't hold a reference to it + * reference counts for this buffer, so it only should be used for in-context access. Also note + * that this buffer changes regularly, thus external classes shouldn't hold a reference to it * (unless they change it). * - * @param clear Whether to clear vector before returning; the buffers will still be refcounted but - * the returned array will be the only reference to them + * @param clear Whether to clear vector before returning, the buffers will still be refcounted but + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. */ @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java index 17ccdbf0eae..f6b3de88b77 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java @@ -546,7 +546,8 @@ public void reset() { * (unless they change it). * * @param clear Whether to clear vector before returning, the buffers will still be refcounted but - * the returned array will be the only reference to them + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. 
*/ @Override @@ -561,7 +562,7 @@ public ArrowBuf[] getBuffers(boolean clear) { list.add(validityBuffer); list.add(offsetBuffer); list.add(sizeBuffer); - list.addAll(Arrays.asList(vector.getBuffers(clear))); + list.addAll(Arrays.asList(vector.getBuffers(false))); buffers = list.toArray(new ArrowBuf[list.size()]); } if (clear) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index a1e18210fc6..76682c28fe6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -726,12 +726,13 @@ public void reset() { /** * Return the underlying buffers associated with this vector. Note that this doesn't impact the - * reference counts for this buffer so it only should be used for in-context access. Also note - * that this buffer changes regularly thus external classes shouldn't hold a reference to it + * reference counts for this buffer, so it only should be used for in-context access. Also note + * that this buffer changes regularly, thus external classes shouldn't hold a reference to it * (unless they change it). * - * @param clear Whether to clear vector before returning; the buffers will still be refcounted but - * the returned array will be the only reference to them + * @param clear Whether to clear vector before returning, the buffers will still be refcounted but + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. */ @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java index 6ced66d81ec..7f6d92f3be9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java @@ -704,7 +704,8 @@ public void reset() { * (unless they change it). * * @param clear Whether to clear vector before returning, the buffers will still be refcounted but - * the returned array will be the only reference to them + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. */ @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index dda9b6547f7..ca5f572034c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -396,12 +396,13 @@ public int getValueCapacity() { /** * Return the underlying buffers associated with this vector. Note that this doesn't impact the - * reference counts for this buffer so it only should be used for in-context access. Also note - * that this buffer changes regularly thus external classes shouldn't hold a reference to it + * reference counts for this buffer, so it only should be used for in-context access. Also note + * that this buffer changes regularly, thus external classes shouldn't hold a reference to it * (unless they change it). 
* - * @param clear Whether to clear vector before returning; the buffers will still be refcounted but - * the returned array will be the only reference to them + * @param clear Whether to clear vector before returning, the buffers will still be refcounted but + * the returned array will be the only reference to them. Also, this won't clear the child + * buffers. * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. */ @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java index 48cf78a4c2e..28d73a8fdff 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java @@ -25,6 +25,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -122,7 +123,10 @@ public void testListTypeReset() { "VarList", allocator, FieldType.nullable(MinorType.INT.getType()), null); final FixedSizeListVector fixedList = new FixedSizeListVector( - "FixedList", allocator, FieldType.nullable(new FixedSizeList(2)), null)) { + "FixedList", allocator, FieldType.nullable(new FixedSizeList(2)), null); + final ListViewVector variableViewList = + new ListViewVector( + "VarListView", allocator, FieldType.nullable(MinorType.INT.getType()), null)) { // ListVector variableList.allocateNewSafe(); variableList.startNewValue(0); @@ -136,6 +140,13 @@ public void testListTypeReset() { fixedList.setNull(0); fixedList.setValueCount(1); resetVectorAndVerify(fixedList, fixedList.getBuffers(false)); + + // ListViewVector + variableViewList.allocateNewSafe(); + variableViewList.startNewValue(0); + variableViewList.endValue(0, 0); + variableViewList.setValueCount(1); + resetVectorAndVerify(variableViewList, variableViewList.getBuffers(false)); } } From 6e7125b61f2ff587a09dbe45ab05d2f28632a702 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 14 Aug 2024 10:41:38 +0900 Subject: [PATCH 009/157] GH-43454: [C++][Python] Add Opaque canonical extension type (#43458) ### Rationale for this change Add the newly ratified extension type. ### What changes are included in this PR? The C++/Python implementation only. ### Are these changes tested? Yes ### Are there any user-facing changes? No. 
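For reference, a minimal C++ sketch of the added API, mirroring the unit tests included in this change (`ArrayFromJSON` is a helper from the Arrow testing utilities, not part of the new public API):

```cpp
// Sketch only: construct the new Opaque extension type and wrap a storage array,
// following the usage exercised in cpp/src/arrow/extension/opaque_test.cc below.
#include <memory>

#include "arrow/api.h"
#include "arrow/extension_type.h"
#include "arrow/extension/opaque.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON (test-only helper)

std::shared_ptr<arrow::Array> MakeOpaqueGeometryExample() {
  // Placeholder for PostGIS geometry values that Arrow cannot interpret itself.
  std::shared_ptr<arrow::DataType> type =
      arrow::extension::opaque(arrow::binary(), "geometry", "adbc.postgresql");
  std::shared_ptr<arrow::Array> storage =
      arrow::ArrayFromJSON(arrow::binary(), R"(["foobar", null])");
  // Wrap the binary storage array in the extension type.
  return arrow::ExtensionType::WrapArray(type, storage);
}
```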
* GitHub Issue: #43454 Lead-authored-by: David Li Co-authored-by: Weston Pace Signed-off-by: David Li --- cpp/src/arrow/CMakeLists.txt | 1 + .../compute/kernels/scalar_cast_numeric.cc | 23 ++ cpp/src/arrow/extension/CMakeLists.txt | 6 + cpp/src/arrow/extension/opaque.cc | 109 ++++++++++ cpp/src/arrow/extension/opaque.h | 69 ++++++ cpp/src/arrow/extension/opaque_test.cc | 197 ++++++++++++++++++ docs/source/python/api/arrays.rst | 3 + docs/source/python/api/datatypes.rst | 10 + python/pyarrow/__init__.py | 8 +- python/pyarrow/array.pxi | 28 +++ python/pyarrow/includes/libarrow.pxd | 13 ++ python/pyarrow/lib.pxd | 5 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 6 + python/pyarrow/tests/test_extension_type.py | 46 ++++ python/pyarrow/tests/test_misc.py | 3 + python/pyarrow/types.pxi | 101 +++++++++ 17 files changed, 627 insertions(+), 3 deletions(-) create mode 100644 cpp/src/arrow/extension/opaque.cc create mode 100644 cpp/src/arrow/extension/opaque.h create mode 100644 cpp/src/arrow/extension/opaque_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9c66a58c542..67d2c19f98a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -907,6 +907,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc + extension/opaque.cc json/options.cc json/chunked_builder.cc json/chunker.cc diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 3df86e7d693..bd9be3e8a95 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -865,6 +865,25 @@ std::shared_ptr GetCastToHalfFloat() { return func; } +struct NullExtensionTypeMatcher : public TypeMatcher { + ~NullExtensionTypeMatcher() override = default; + + bool Matches(const DataType& type) const override { + return type.id() == Type::EXTENSION && + checked_cast(type).storage_id() == Type::NA; + } + + std::string ToString() const override { return "extension"; } + + bool Equals(const TypeMatcher& other) const override { + if (this == &other) { + return true; + } + auto casted = dynamic_cast(&other); + return casted != nullptr; + } +}; + } // namespace std::vector> GetNumericCasts() { @@ -875,6 +894,10 @@ std::vector> GetNumericCasts() { auto cast_null = std::make_shared("cast_null", Type::NA); DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(), OutputAllNull)); + // Explicitly allow casting extension type with null backing array to null + DCHECK_OK(cast_null->AddKernel( + Type::EXTENSION, {InputType(std::make_shared())}, null(), + OutputAllNull)); functions.push_back(cast_null); functions.push_back(GetCastToInteger("cast_int8")); diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index c15c42874d4..6741ab602f5 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -21,4 +21,10 @@ add_arrow_test(test PREFIX "arrow-fixed-shape-tensor") +add_arrow_test(test + SOURCES + opaque_test.cc + PREFIX + "arrow-extension-opaque") + arrow_install_all_headers("arrow/extension") diff --git a/cpp/src/arrow/extension/opaque.cc b/cpp/src/arrow/extension/opaque.cc new file mode 100644 index 00000000000..c430bb5d2ea --- /dev/null +++ b/cpp/src/arrow/extension/opaque.cc @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/opaque.h" + +#include + +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/util/logging.h" + +#include +#include +#include + +namespace arrow::extension { + +std::string OpaqueType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() + << "[storage_type=" << storage_type_->ToString(show_metadata) + << ", type_name=" << type_name_ << ", vendor_name=" << vendor_name_ << "]>"; + return ss.str(); +} + +bool OpaqueType::ExtensionEquals(const ExtensionType& other) const { + if (extension_name() != other.extension_name()) { + return false; + } + const auto& opaque = internal::checked_cast(other); + return storage_type()->Equals(*opaque.storage_type()) && + type_name() == opaque.type_name() && vendor_name() == opaque.vendor_name(); +} + +std::string OpaqueType::Serialize() const { + rapidjson::Document document; + document.SetObject(); + rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); + + rapidjson::Value type_name(rapidjson::StringRef(type_name_)); + document.AddMember(rapidjson::Value("type_name", allocator), type_name, allocator); + rapidjson::Value vendor_name(rapidjson::StringRef(vendor_name_)); + document.AddMember(rapidjson::Value("vendor_name", allocator), vendor_name, allocator); + + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + document.Accept(writer); + return buffer.GetString(); +} + +Result> OpaqueType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + rapidjson::Document document; + const auto& parsed = document.Parse(serialized_data.data(), serialized_data.length()); + if (parsed.HasParseError()) { + return Status::Invalid("Invalid serialized JSON data for OpaqueType: ", + rapidjson::GetParseError_En(parsed.GetParseError()), ": ", + serialized_data); + } else if (!document.IsObject()) { + return Status::Invalid("Invalid serialized JSON data for OpaqueType: not an object"); + } + if (!document.HasMember("type_name")) { + return Status::Invalid( + "Invalid serialized JSON data for OpaqueType: missing type_name"); + } else if (!document.HasMember("vendor_name")) { + return Status::Invalid( + "Invalid serialized JSON data for OpaqueType: missing vendor_name"); + } + + const auto& type_name = document["type_name"]; + const auto& vendor_name = document["vendor_name"]; + if (!type_name.IsString()) { + return Status::Invalid( + "Invalid serialized JSON data for OpaqueType: type_name is not a string"); + } else if (!vendor_name.IsString()) { + return Status::Invalid( + "Invalid serialized JSON data for OpaqueType: vendor_name is not a string"); + } + + return opaque(std::move(storage_type), type_name.GetString(), vendor_name.GetString()); +} + +std::shared_ptr OpaqueType::MakeArray(std::shared_ptr data) 
const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.opaque", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +std::shared_ptr opaque(std::shared_ptr storage_type, + std::string type_name, std::string vendor_name) { + return std::make_shared(std::move(storage_type), std::move(type_name), + std::move(vendor_name)); +} + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/opaque.h b/cpp/src/arrow/extension/opaque.h new file mode 100644 index 00000000000..9814b391cba --- /dev/null +++ b/cpp/src/arrow/extension/opaque.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension_type.h" +#include "arrow/type.h" + +namespace arrow::extension { + +/// \brief Opaque is a placeholder for a type from an external (usually +/// non-Arrow) system that could not be interpreted. +class ARROW_EXPORT OpaqueType : public ExtensionType { + public: + /// \brief Construct an OpaqueType. + /// + /// \param[in] storage_type The underlying storage type. Should be + /// arrow::null if there is no data. + /// \param[in] type_name The name of the type in the external system. + /// \param[in] vendor_name The name of the external system. + explicit OpaqueType(std::shared_ptr storage_type, std::string type_name, + std::string vendor_name) + : ExtensionType(std::move(storage_type)), + type_name_(std::move(type_name)), + vendor_name_(std::move(vendor_name)) {} + + std::string extension_name() const override { return "arrow.opaque"; } + std::string ToString(bool show_metadata) const override; + bool ExtensionEquals(const ExtensionType& other) const override; + std::string Serialize() const override; + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + /// Create an OpaqueArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + std::string_view type_name() const { return type_name_; } + std::string_view vendor_name() const { return vendor_name_; } + + private: + std::string type_name_; + std::string vendor_name_; +}; + +/// \brief Opaque is a wrapper for (usually binary) data from an external +/// (often non-Arrow) system that could not be interpreted. +class ARROW_EXPORT OpaqueArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Return an OpaqueType instance. 
+ARROW_EXPORT std::shared_ptr opaque(std::shared_ptr storage_type, + std::string type_name, + std::string vendor_name); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/opaque_test.cc b/cpp/src/arrow/extension/opaque_test.cc new file mode 100644 index 00000000000..1629cdb3965 --- /dev/null +++ b/cpp/src/arrow/extension/opaque_test.cc @@ -0,0 +1,197 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/opaque.h" +#include "arrow/extension_type.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" +#include "arrow/testing/extension_type.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +TEST(OpaqueType, Basics) { + auto type = internal::checked_pointer_cast( + extension::opaque(null(), "type", "vendor")); + auto type2 = internal::checked_pointer_cast( + extension::opaque(null(), "type2", "vendor")); + ASSERT_EQ("arrow.opaque", type->extension_name()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + ASSERT_NE(*type, *type2); + ASSERT_EQ(*arrow::null(), *type->storage_type()); + ASSERT_THAT(type->Serialize(), ::testing::Not(::testing::IsEmpty())); + ASSERT_EQ(R"({"type_name":"type","vendor_name":"vendor"})", type->Serialize()); + ASSERT_EQ("type", type->type_name()); + ASSERT_EQ("vendor", type->vendor_name()); + ASSERT_EQ( + "extension", + type->ToString(false)); +} + +TEST(OpaqueType, Equals) { + auto type = internal::checked_pointer_cast( + extension::opaque(null(), "type", "vendor")); + auto type2 = internal::checked_pointer_cast( + extension::opaque(null(), "type2", "vendor")); + auto type3 = internal::checked_pointer_cast( + extension::opaque(null(), "type", "vendor2")); + auto type4 = internal::checked_pointer_cast( + extension::opaque(int64(), "type", "vendor")); + auto type5 = internal::checked_pointer_cast( + extension::opaque(null(), "type", "vendor")); + auto type6 = internal::checked_pointer_cast( + extension::fixed_shape_tensor(float64(), {1})); + + ASSERT_EQ(*type, *type); + ASSERT_EQ(*type2, *type2); + ASSERT_EQ(*type3, *type3); + ASSERT_EQ(*type4, *type4); + ASSERT_EQ(*type5, *type5); + + ASSERT_EQ(*type, *type5); + + ASSERT_NE(*type, *type2); + ASSERT_NE(*type, *type3); + ASSERT_NE(*type, *type4); + ASSERT_NE(*type, *type6); + + ASSERT_NE(*type2, *type); + ASSERT_NE(*type2, *type3); + ASSERT_NE(*type2, *type4); + ASSERT_NE(*type2, *type6); + + ASSERT_NE(*type3, *type); + ASSERT_NE(*type3, *type2); + ASSERT_NE(*type3, *type4); + ASSERT_NE(*type3, *type6); + + ASSERT_NE(*type4, *type); + ASSERT_NE(*type4, *type2); + ASSERT_NE(*type4, *type3); 
+ ASSERT_NE(*type4, *type6); + ASSERT_NE(*type6, *type4); +} + +TEST(OpaqueType, CreateFromArray) { + auto type = internal::checked_pointer_cast( + extension::opaque(binary(), "geometry", "adbc.postgresql")); + auto storage = ArrayFromJSON(binary(), R"(["foobar", null])"); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(2, array->length()); + ASSERT_EQ(1, array->null_count()); +} + +void CheckDeserialize(const std::string& serialized, + const std::shared_ptr& expected) { + auto type = internal::checked_pointer_cast(expected); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*expected, *deserialized); +} + +TEST(OpaqueType, Deserialize) { + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"type_name": "type", "vendor_name": "vendor"})", + extension::opaque(null(), "type", "vendor"))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"type_name": "long name", "vendor_name": "long name"})", + extension::opaque(null(), "long name", "long name"))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"type_name": "名前", "vendor_name": "名字"})", + extension::opaque(null(), "名前", "名字"))); + ASSERT_NO_FATAL_FAILURE(CheckDeserialize( + R"({"type_name": "type", "vendor_name": "vendor", "extra_field": 2})", + extension::opaque(null(), "type", "vendor"))); + + auto type = internal::checked_pointer_cast( + extension::opaque(null(), "type", "vendor")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("The document is empty"), + type->Deserialize(null(), R"()")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("Missing a name for object member"), + type->Deserialize(null(), R"({)")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("not an object"), + type->Deserialize(null(), R"([])")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("missing type_name"), + type->Deserialize(null(), R"({})")); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("type_name is not a string"), + type->Deserialize(null(), R"({"type_name": 2, "vendor_name": ""})")); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("type_name is not a string"), + type->Deserialize(null(), R"({"type_name": null, "vendor_name": ""})")); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("vendor_name is not a string"), + type->Deserialize(null(), R"({"vendor_name": 2, "type_name": ""})")); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("vendor_name is not a string"), + type->Deserialize(null(), R"({"vendor_name": null, "type_name": ""})")); +} + +TEST(OpaqueType, MetadataRoundTrip) { + for (const auto& type : { + extension::opaque(null(), "foo", "bar"), + extension::opaque(binary(), "geometry", "postgis"), + extension::opaque(fixed_size_list(int64(), 4), "foo", "bar"), + extension::opaque(utf8(), "foo", "bar"), + }) { + auto opaque = internal::checked_pointer_cast(type); + std::string serialized = opaque->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + opaque->Deserialize(opaque->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized); + } +} + +TEST(OpaqueType, BatchRoundTrip) { + auto type = internal::checked_pointer_cast( + extension::opaque(binary(), "geometry", "adbc.postgresql")); + ExtensionTypeGuard guard(type); + + auto storage = ArrayFromJSON(binary(), R"(["foobar", null])"); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = + RecordBatch::Make(schema({field("field", type)}), array->length(), {array}); + + 
std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), *written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + +} // namespace arrow diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index aefed00b3d2..4ad35b190cd 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -85,6 +85,7 @@ may expose data type-specific methods or properties. UnionArray ExtensionArray FixedShapeTensorArray + OpaqueArray .. _api.scalar: @@ -143,3 +144,5 @@ classes may expose data type-specific methods or properties. StructScalar UnionScalar ExtensionScalar + FixedShapeTensorScalar + OpaqueScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 7edb4e16154..a43c5299eae 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -67,6 +67,8 @@ These should be used to create Arrow data types and schemas. struct dictionary run_end_encoded + fixed_shape_tensor + opaque field schema from_numpy_dtype @@ -117,6 +119,14 @@ Specific classes and functions for extension types. register_extension_type unregister_extension_type +:doc:`Canonical extension types <../../format/CanonicalExtensions>` +implemented by PyArrow. + +.. autosummary:: + :toctree: ../generated/ + + FixedShapeTensorType + OpaqueType .. _api.types.checking: .. 
currentmodule:: pyarrow.types diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index e52e0d242be..aa7bab9f97e 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -173,6 +173,7 @@ def print_entry(label, value): dictionary, run_end_encoded, fixed_shape_tensor, + opaque, field, type_for_alias, DataType, DictionaryType, StructType, @@ -182,7 +183,7 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, FixedShapeTensorType, + RunEndEncodedType, FixedShapeTensorType, OpaqueType, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, @@ -216,7 +217,7 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, FixedShapeTensorArray, + RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, @@ -233,7 +234,8 @@ def print_entry(label, value): StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, - RunEndEncodedScalar, ExtensionScalar) + RunEndEncodedScalar, ExtensionScalar, + FixedShapeTensorScalar, OpaqueScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 997f208a5de..6c40a21db96 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4448,6 +4448,34 @@ cdef class FixedShapeTensorArray(ExtensionArray): ) +cdef class OpaqueArray(ExtensionArray): + """ + Concrete class for opaque extension arrays. + + Examples + -------- + Define the extension type for an opaque array + + >>> import pyarrow as pa + >>> opaque_type = pa.opaque( + ... pa.binary(), + ... type_name="geometry", + ... vendor_name="postgis", + ... 
) + + Create an extension array + + >>> arr = [None, b"data"] + >>> storage = pa.array(arr, pa.binary()) + >>> pa.ExtensionArray.from_storage(opaque_type, storage) + + [ + null, + 64617461 + ] + """ + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 0d871f411b1..9b008d150f1 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2882,6 +2882,19 @@ cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extens " arrow::extension::FixedShapeTensorArray"(CExtensionArray): const CResult[shared_ptr[CTensor]] ToTensor() const + +cdef extern from "arrow/extension/opaque.h" namespace "arrow::extension" nogil: + cdef cppclass COpaqueType \ + " arrow::extension::OpaqueType"(CExtensionType): + + c_string type_name() + c_string vendor_name() + + cdef cppclass COpaqueArray \ + " arrow::extension::OpaqueArray"(CExtensionArray): + pass + + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 082d8470cdb..2cb302d20a8 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -215,6 +215,11 @@ cdef class FixedShapeTensorType(BaseExtensionType): const CFixedShapeTensorType* tensor_ext_type +cdef class OpaqueType(BaseExtensionType): + cdef: + const COpaqueType* opaque_ext_type + + cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 966273b4bea..2f9fc1c5542 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -124,6 +124,8 @@ cdef api object pyarrow_wrap_data_type( return cpy_ext_type.GetInstance() elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) + elif ext_type.extension_name() == b"arrow.opaque": + out = OpaqueType.__new__(OpaqueType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 41bfde39adb..12a99c2aece 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1085,6 +1085,12 @@ cdef class FixedShapeTensorScalar(ExtensionScalar): return pyarrow_wrap_tensor(ctensor) +cdef class OpaqueScalar(ExtensionScalar): + """ + Concrete class for opaque extension scalar. 
+ """ + + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 1c4d0175a2d..58c54189f22 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1661,3 +1661,49 @@ def test_legacy_int_type(): batch = ipc_read_batch(buf) assert isinstance(batch.column(0).type, LegacyIntType) assert batch.column(0) == ext_arr + + +@pytest.mark.parametrize("storage_type,storage", [ + (pa.null(), [None] * 4), + (pa.int64(), [1, 2, None, 4]), + (pa.binary(), [None, b"foobar"]), + (pa.list_(pa.int64()), [[], [1, 2], None, [3, None]]), +]) +def test_opaque_type(pickle_module, storage_type, storage): + opaque_type = pa.opaque(storage_type, "type", "vendor") + assert opaque_type.extension_name == "arrow.opaque" + assert opaque_type.storage_type == storage_type + assert opaque_type.type_name == "type" + assert opaque_type.vendor_name == "vendor" + assert "arrow.opaque" in str(opaque_type) + + assert opaque_type == opaque_type + assert opaque_type != storage_type + assert opaque_type != pa.opaque(storage_type, "type2", "vendor") + assert opaque_type != pa.opaque(storage_type, "type", "vendor2") + assert opaque_type != pa.opaque(pa.decimal128(12, 3), "type", "vendor") + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(opaque_type)) + assert result == opaque_type + + # IPC roundtrip + opaque_arr_class = opaque_type.__arrow_ext_class__() + storage = pa.array(storage, storage_type) + arr = pa.ExtensionArray.from_storage(opaque_type, storage) + assert isinstance(arr, opaque_arr_class) + + with registered_extension_type(opaque_type): + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.opaque" + assert isinstance(batch.column(0), opaque_arr_class) + + # cast storage -> extension type + result = storage.cast(opaque_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(storage_type) + assert inner == storage diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index c42e4fbdfc2..9a55a38177f 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -247,6 +247,9 @@ def test_set_timezone_db_path_non_windows(): pa.ProxyMemoryPool, pa.Device, pa.MemoryManager, + pa.OpaqueArray, + pa.OpaqueScalar, + pa.OpaqueType, ]) def test_extension_type_constructor_errors(klass): # ARROW-2638: prevent calling extension class constructors directly diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 039870accdd..93d68fb8478 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1837,6 +1837,50 @@ cdef class FixedShapeTensorType(BaseExtensionType): return FixedShapeTensorScalar +cdef class OpaqueType(BaseExtensionType): + """ + Concrete class for opaque extension type. + + Opaque is a placeholder for a type from an external (often non-Arrow) + system that could not be interpreted. + + Examples + -------- + Create an instance of opaque extension type: + + >>> import pyarrow as pa + >>> pa.opaque(pa.int32(), "geometry", "postgis") + OpaqueType(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.opaque_ext_type = type.get() + + @property + def type_name(self): + """ + The name of the type in the external system. 
+ """ + return frombytes(c_string(self.opaque_ext_type.type_name())) + + @property + def vendor_name(self): + """ + The name of the external system. + """ + return frombytes(c_string(self.opaque_ext_type.vendor_name())) + + def __arrow_ext_class__(self): + return OpaqueArray + + def __reduce__(self): + return opaque, (self.storage_type, self.type_name, self.vendor_name) + + def __arrow_ext_scalar_class__(self): + return OpaqueScalar + + _py_extension_type_auto_load = False @@ -5234,6 +5278,63 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out +def opaque(DataType storage_type, str type_name not None, str vendor_name not None): + """ + Create instance of opaque extension type. + + Parameters + ---------- + storage_type : DataType + The underlying data type. + type_name : str + The name of the type in the external system. + vendor_name : str + The name of the external system. + + Examples + -------- + Create an instance of an opaque extension type: + + >>> import pyarrow as pa + >>> type = pa.opaque(pa.binary(), "other", "jdbc") + >>> type + OpaqueType(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(binary) + >>> type.type_name + 'other' + >>> type.vendor_name + 'jdbc' + + Create a table with an opaque array: + + >>> arr = [None, b"foobar"] + >>> storage = pa.array(arr, pa.binary()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[null,666F6F626172]] + + Returns + ------- + type : OpaqueType + """ + + cdef: + c_string c_type_name = tobytes(type_name) + c_string c_vendor_name = tobytes(vendor_name) + shared_ptr[CDataType] c_type = make_shared[COpaqueType]( + storage_type.sp_type, c_type_name, c_vendor_name) + OpaqueType out = OpaqueType.__new__(OpaqueType) + out.init(c_type) + return out + + cdef dict _type_aliases = { 'null': null, 'bool': bool_, From ce251a6721cfcd27ed76bbaa5cb1c824a5f23a94 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 14 Aug 2024 08:58:15 +0530 Subject: [PATCH 010/157] GH-41291: [Java] LargeListViewVector Implementation transferPair implementation (#43637) ### Rationale for this change Integrating the `transferPair` and `copyFrom` functionality to `LargeListViewVector` - [X] https://github.com/apache/arrow/issues/41292 ### What changes are included in this PR? This PR includes the `TransferPairImpl`, corresponding functions and test cases. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41291 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- .../BaseLargeRepeatedValueViewVector.java | 2 +- .../vector/complex/LargeListViewVector.java | 163 ++++++- .../arrow/vector/TestLargeListViewVector.java | 456 ++++++++++++++++++ .../arrow/vector/TestSplitAndTransfer.java | 20 + 4 files changed, 634 insertions(+), 7 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java index 26079cbee95..f643306cfdc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java @@ -102,7 +102,7 @@ private void allocateBuffers() { sizeBuffer = allocateBuffers(sizeAllocationSizeInBytes); } - private ArrowBuf allocateBuffers(final long size) { + protected ArrowBuf allocateBuffers(final long size) { final int curSize = (int) size; ArrowBuf buffer = allocator.buffer(curSize); buffer.readerIndex(0); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java index f6b3de88b77..2c61f799a4c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java @@ -39,6 +39,7 @@ import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueIterableVector; import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.compare.VectorVisitor; import org.apache.arrow.vector.complex.impl.UnionLargeListViewReader; import org.apache.arrow.vector.complex.impl.UnionLargeListViewWriter; @@ -361,20 +362,17 @@ public TransferPair getTransferPair(Field field, BufferAllocator allocator) { @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { - throw new UnsupportedOperationException( - "LargeListViewVector does not support getTransferPair(String, BufferAllocator, CallBack) yet"); + return new TransferImpl(ref, allocator, callBack); } @Override public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { - throw new UnsupportedOperationException( - "LargeListViewVector does not support getTransferPair(Field, BufferAllocator, CallBack) yet"); + return new TransferImpl(field, allocator, callBack); } @Override public TransferPair makeTransferPair(ValueVector target) { - throw new UnsupportedOperationException( - "LargeListViewVector does not support makeTransferPair(ValueVector) yet"); + return new TransferImpl((LargeListViewVector) target); } @Override @@ -452,6 +450,159 @@ public OUT accept(VectorVisitor visitor, IN value) { return visitor.visit(this, value); } + private class TransferImpl implements TransferPair { + + LargeListViewVector to; + TransferPair dataTransferPair; + + public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { + this(new LargeListViewVector(name, allocator, field.getFieldType(), callBack)); + } + + public TransferImpl(Field field, BufferAllocator allocator, CallBack callBack) { + this(new LargeListViewVector(field, allocator, callBack)); + } + + public TransferImpl(LargeListViewVector to) { + this.to = to; + 
to.addOrGetVector(vector.getField().getFieldType()); + if (to.getDataVector() instanceof ZeroVector) { + to.addOrGetVector(vector.getField().getFieldType()); + } + dataTransferPair = getDataVector().makeTransferPair(to.getDataVector()); + } + + @Override + public void transfer() { + to.clear(); + dataTransferPair.transfer(); + to.validityBuffer = transferBuffer(validityBuffer, to.allocator); + to.offsetBuffer = transferBuffer(offsetBuffer, to.allocator); + to.sizeBuffer = transferBuffer(sizeBuffer, to.allocator); + if (valueCount > 0) { + to.setValueCount(valueCount); + } + clear(); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + Preconditions.checkArgument( + startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", + startIndex, + length, + valueCount); + to.clear(); + if (length > 0) { + // we have to scan by index since there are out-of-order offsets + to.offsetBuffer = to.allocateBuffers((long) length * OFFSET_WIDTH); + to.sizeBuffer = to.allocateBuffers((long) length * SIZE_WIDTH); + + /* splitAndTransfer the size buffer */ + int maxOffsetAndSizeSum = Integer.MIN_VALUE; + int minOffsetValue = Integer.MAX_VALUE; + for (int i = 0; i < length; i++) { + final int offsetValue = offsetBuffer.getInt((long) (startIndex + i) * OFFSET_WIDTH); + final int sizeValue = sizeBuffer.getInt((long) (startIndex + i) * SIZE_WIDTH); + to.sizeBuffer.setInt((long) i * SIZE_WIDTH, sizeValue); + maxOffsetAndSizeSum = Math.max(maxOffsetAndSizeSum, offsetValue + sizeValue); + minOffsetValue = Math.min(minOffsetValue, offsetValue); + } + + /* splitAndTransfer the offset buffer */ + for (int i = 0; i < length; i++) { + final int offsetValue = offsetBuffer.getInt((long) (startIndex + i) * OFFSET_WIDTH); + final int relativeOffset = offsetValue - minOffsetValue; + to.offsetBuffer.setInt((long) i * OFFSET_WIDTH, relativeOffset); + } + + /* splitAndTransfer the validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + + /* splitAndTransfer the data buffer */ + final int childSliceLength = maxOffsetAndSizeSum - minOffsetValue; + dataTransferPair.splitAndTransfer(minOffsetValue, childSliceLength); + to.setValueCount(length); + } + } + + /* + * transfer the validity. + */ + private void splitAndTransferValidityBuffer( + int startIndex, int length, LargeListViewVector target) { + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.getReferenceManager().release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.getReferenceManager().retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. 
+ */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = + BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset); + byte b2 = + BitVectorHelper.getBitsFromNextByte( + validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = + BitVectorHelper.getBitsFromCurrentByte( + validityBuffer, firstByteSource + byteSizeTarget - 1, offset); + byte b2 = + BitVectorHelper.getBitsFromNextByte( + validityBuffer, firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = + BitVectorHelper.getBitsFromCurrentByte( + validityBuffer, firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, LargeListViewVector.this); + } + } + @Override protected FieldReader getReaderImpl() { throw new UnsupportedOperationException( diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java index 563ac811c4f..2ed8d4d7005 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java @@ -18,6 +18,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; @@ -32,6 +33,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -1639,6 +1641,460 @@ public void testOutOfOrderOffset1() { } } + private int validateSizeBufferAndCalculateMinOffset( + int start, + int splitLength, + ArrowBuf fromOffsetBuffer, + ArrowBuf fromSizeBuffer, + ArrowBuf toSizeBuffer) { + int minOffset = fromOffsetBuffer.getInt((long) start * LargeListViewVector.OFFSET_WIDTH); + int fromDataLength; + int toDataLength; + + for (int i = 0; i < splitLength; i++) { + fromDataLength = fromSizeBuffer.getInt((long) (start + i) * LargeListViewVector.SIZE_WIDTH); + toDataLength = toSizeBuffer.getInt((long) (i) * LargeListViewVector.SIZE_WIDTH); + + /* validate size */ + assertEquals( + fromDataLength, + toDataLength, + "Different data lengths at index: " + i + " and start: " + start); + + /* calculate minimum offset */ + int currentOffset = + fromOffsetBuffer.getInt((long) (start + i) * LargeListViewVector.OFFSET_WIDTH); + if (currentOffset < minOffset) { + minOffset = currentOffset; + } + } + 
+ return minOffset; + } + + private void validateOffsetBuffer( + int start, + int splitLength, + ArrowBuf fromOffsetBuffer, + ArrowBuf toOffsetBuffer, + int minOffset) { + int offset1; + int offset2; + + for (int i = 0; i < splitLength; i++) { + offset1 = fromOffsetBuffer.getInt((long) (start + i) * LargeListViewVector.OFFSET_WIDTH); + offset2 = toOffsetBuffer.getInt((long) (i) * LargeListViewVector.OFFSET_WIDTH); + assertEquals( + offset1 - minOffset, + offset2, + "Different offset values at index: " + i + " and start: " + start); + } + } + + private void validateDataBuffer( + int start, + int splitLength, + ArrowBuf fromOffsetBuffer, + ArrowBuf fromSizeBuffer, + BigIntVector fromDataVector, + ArrowBuf toOffsetBuffer, + BigIntVector toDataVector) { + int dataLength; + Long fromValue; + for (int i = 0; i < splitLength; i++) { + dataLength = fromSizeBuffer.getInt((long) (start + i) * LargeListViewVector.SIZE_WIDTH); + for (int j = 0; j < dataLength; j++) { + fromValue = + fromDataVector.getObject( + (fromOffsetBuffer.getInt((long) (start + i) * LargeListViewVector.OFFSET_WIDTH) + + j)); + Long toValue = + toDataVector.getObject( + (toOffsetBuffer.getInt((long) i * LargeListViewVector.OFFSET_WIDTH) + j)); + assertEquals( + fromValue, toValue, "Different data values at index: " + i + " and start: " + start); + } + } + } + + /** + * Validate split and transfer of data from fromVector to toVector. Note that this method assumes + * that the child vector is BigIntVector. + * + * @param start start index + * @param splitLength length of data to split and transfer + * @param fromVector fromVector + * @param toVector toVector + */ + private void validateSplitAndTransfer( + TransferPair transferPair, + int start, + int splitLength, + LargeListViewVector fromVector, + LargeListViewVector toVector) { + + transferPair.splitAndTransfer(start, splitLength); + + /* get offsetBuffer of toVector */ + final ArrowBuf toOffsetBuffer = toVector.getOffsetBuffer(); + + /* get sizeBuffer of toVector */ + final ArrowBuf toSizeBuffer = toVector.getSizeBuffer(); + + /* get dataVector of toVector */ + BigIntVector toDataVector = (BigIntVector) toVector.getDataVector(); + + /* get offsetBuffer of toVector */ + final ArrowBuf fromOffsetBuffer = fromVector.getOffsetBuffer(); + + /* get sizeBuffer of toVector */ + final ArrowBuf fromSizeBuffer = fromVector.getSizeBuffer(); + + /* get dataVector of toVector */ + BigIntVector fromDataVector = (BigIntVector) fromVector.getDataVector(); + + /* validate size buffers */ + int minOffset = + validateSizeBufferAndCalculateMinOffset( + start, splitLength, fromOffsetBuffer, fromSizeBuffer, toSizeBuffer); + /* validate offset buffers */ + validateOffsetBuffer(start, splitLength, fromOffsetBuffer, toOffsetBuffer, minOffset); + /* validate data */ + validateDataBuffer( + start, + splitLength, + fromOffsetBuffer, + fromSizeBuffer, + fromDataVector, + toOffsetBuffer, + toDataVector); + } + + @Test + public void testSplitAndTransfer() throws Exception { + try (LargeListViewVector fromVector = LargeListViewVector.empty("sourceVector", allocator)) { + + /* Explicitly add the dataVector */ + MinorType type = MinorType.BIGINT; + fromVector.addOrGetVector(FieldType.nullable(type.getType())); + + UnionLargeListViewWriter listViewWriter = fromVector.getWriter(); + + /* allocate memory */ + listViewWriter.allocate(); + + /* populate data */ + listViewWriter.setPosition(0); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(10); + listViewWriter.bigInt().writeBigInt(11); + 
listViewWriter.bigInt().writeBigInt(12); + listViewWriter.endListView(); + + listViewWriter.setPosition(1); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(13); + listViewWriter.bigInt().writeBigInt(14); + listViewWriter.endListView(); + + listViewWriter.setPosition(2); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(15); + listViewWriter.bigInt().writeBigInt(16); + listViewWriter.bigInt().writeBigInt(17); + listViewWriter.bigInt().writeBigInt(18); + listViewWriter.endListView(); + + listViewWriter.setPosition(3); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(19); + listViewWriter.endListView(); + + listViewWriter.setPosition(4); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(20); + listViewWriter.bigInt().writeBigInt(21); + listViewWriter.bigInt().writeBigInt(22); + listViewWriter.bigInt().writeBigInt(23); + listViewWriter.endListView(); + + fromVector.setValueCount(5); + + /* get offset buffer */ + final ArrowBuf offsetBuffer = fromVector.getOffsetBuffer(); + + /* get size buffer */ + final ArrowBuf sizeBuffer = fromVector.getSizeBuffer(); + + /* get dataVector */ + BigIntVector dataVector = (BigIntVector) fromVector.getDataVector(); + + /* check the vector output */ + + int index = 0; + int offset; + int size = 0; + Long actual; + + /* index 0 */ + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * LargeListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(0), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(10), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(11), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(12), actual); + assertEquals( + Integer.toString(3), + Integer.toString(sizeBuffer.getInt(index * LargeListViewVector.SIZE_WIDTH))); + + /* index 1 */ + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * LargeListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(13), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(14), actual); + size++; + assertEquals( + Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * LargeListViewVector.SIZE_WIDTH))); + + /* index 2 */ + size = 0; + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * LargeListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(5), Integer.toString(offset)); + size++; + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(15), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(16), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(17), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(18), actual); + assertEquals( + Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * LargeListViewVector.SIZE_WIDTH))); + + /* index 3 */ + size = 0; + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * LargeListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(9), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(19), actual); + size++; + assertEquals( + 
Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * LargeListViewVector.SIZE_WIDTH))); + + /* index 4 */ + size = 0; + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * LargeListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(10), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(20), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(21), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(22), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(23), actual); + size++; + assertEquals( + Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * LargeListViewVector.SIZE_WIDTH))); + + /* do split and transfer */ + try (LargeListViewVector toVector = LargeListViewVector.empty("toVector", allocator)) { + int[][] transferLengths = {{0, 2}, {3, 1}, {4, 1}}; + TransferPair transferPair = fromVector.makeTransferPair(toVector); + + for (final int[] transferLength : transferLengths) { + int start = transferLength[0]; + int splitLength = transferLength[1]; + validateSplitAndTransfer(transferPair, start, splitLength, fromVector, toVector); + } + } + } + } + + @Test + public void testGetTransferPairWithField() throws Exception { + try (final LargeListViewVector fromVector = LargeListViewVector.empty("listview", allocator)) { + + UnionLargeListViewWriter writer = fromVector.getWriter(); + writer.allocate(); + + // set some values + writer.startListView(); + writer.integer().writeInt(1); + writer.integer().writeInt(2); + writer.endListView(); + fromVector.setValueCount(2); + + final TransferPair transferPair = + fromVector.getTransferPair(fromVector.getField(), allocator); + final LargeListViewVector toVector = (LargeListViewVector) transferPair.getTo(); + // Field inside a new vector created by reusing a field should be the same in memory as the + // original field. + assertSame(toVector.getField(), fromVector.getField()); + } + } + + @Test + public void testOutOfOrderOffsetSplitAndTransfer() { + // [[12, -7, 25], null, [0, -127, 127, 50], [], [50, 12]] + try (LargeListViewVector fromVector = LargeListViewVector.empty("fromVector", allocator)) { + // Allocate buffers in LargeListViewVector by calling `allocateNew` method. + fromVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + + FieldType fieldType = new FieldType(true, new ArrowType.Int(64, true), null, null); + Field field = new Field("child-vector", fieldType, null); + fromVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = fromVector.getDataVector(); + fieldVector.clear(); + + BigIntVector childVector = (BigIntVector) fieldVector; + + childVector.allocateNew(7); + + childVector.set(0, 0); + childVector.set(1, -127); + childVector.set(2, 127); + childVector.set(3, 50); + childVector.set(4, 12); + childVector.set(5, -7); + childVector.set(6, 25); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. 
+ fromVector.setValidity(0, 1); + fromVector.setValidity(1, 0); + fromVector.setValidity(2, 1); + fromVector.setValidity(3, 1); + fromVector.setValidity(4, 1); + + fromVector.setOffset(0, 4); + fromVector.setOffset(1, 7); + fromVector.setOffset(2, 0); + fromVector.setOffset(3, 0); + fromVector.setOffset(4, 3); + + fromVector.setSize(0, 3); + fromVector.setSize(1, 0); + fromVector.setSize(2, 4); + fromVector.setSize(3, 0); + fromVector.setSize(4, 2); + + // Set value count using `setValueCount` method. + fromVector.setValueCount(5); + + final ArrowBuf offSetBuffer = fromVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = fromVector.getSizeBuffer(); + + // check offset buffer + assertEquals(4, offSetBuffer.getInt(0 * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(1 * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(2 * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(3 * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(4 * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseLargeRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseLargeRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseLargeRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseLargeRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(4 * BaseLargeRepeatedValueViewVector.SIZE_WIDTH)); + + // check child vector + assertEquals(0, ((BigIntVector) fromVector.getDataVector()).get(0)); + assertEquals(-127, ((BigIntVector) fromVector.getDataVector()).get(1)); + assertEquals(127, ((BigIntVector) fromVector.getDataVector()).get(2)); + assertEquals(50, ((BigIntVector) fromVector.getDataVector()).get(3)); + assertEquals(12, ((BigIntVector) fromVector.getDataVector()).get(4)); + assertEquals(-7, ((BigIntVector) fromVector.getDataVector()).get(5)); + assertEquals(25, ((BigIntVector) fromVector.getDataVector()).get(6)); + + // check values + Object result = fromVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(12), resultSet.get(0)); + assertEquals(Long.valueOf(-7), resultSet.get(1)); + assertEquals(Long.valueOf(25), resultSet.get(2)); + + assertTrue(fromVector.isNull(1)); + + result = fromVector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(4, resultSet.size()); + assertEquals(Long.valueOf(0), resultSet.get(0)); + assertEquals(Long.valueOf(-127), resultSet.get(1)); + assertEquals(Long.valueOf(127), resultSet.get(2)); + assertEquals(Long.valueOf(50), resultSet.get(3)); + + assertTrue(fromVector.isEmpty(3)); + + result = fromVector.getObject(4); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(50), resultSet.get(0)); + assertEquals(Long.valueOf(12), resultSet.get(1)); + + fromVector.validate(); + + /* do split and transfer */ + try (LargeListViewVector toVector = LargeListViewVector.empty("toVector", allocator)) { + int[][] transferLengths = {{2, 3}, {0, 1}, {0, 3}}; + TransferPair transferPair = fromVector.makeTransferPair(toVector); + + for (final int[] transferLength : transferLengths) { + int start = transferLength[0]; + int splitLength = transferLength[1]; + validateSplitAndTransfer(transferPair, start, splitLength, fromVector, toVector); + } + } + } + } + private 
void writeIntValues(UnionLargeListViewWriter writer, int[] values) { writer.startListView(); for (int v : values) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java index d20dc3348b1..a3f25bc5207 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -29,6 +29,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; @@ -852,6 +853,25 @@ public void testListVectorZeroStartIndexAndLength() { } } + @Test + public void testLargeListViewVectorZeroStartIndexAndLength() { + try (final LargeListViewVector listVector = + LargeListViewVector.empty("largelistview", allocator); + final LargeListViewVector newListVector = LargeListViewVector.empty("newList", allocator)) { + + listVector.allocateNew(); + final int valueCount = 0; + listVector.setValueCount(valueCount); + + final TransferPair tp = listVector.makeTransferPair(newListVector); + + tp.splitAndTransfer(0, 0); + assertEquals(valueCount, newListVector.getValueCount()); + + newListVector.clear(); + } + } + @Test public void testStructVectorZeroStartIndexAndLength() { Map metadata = new HashMap<>(); From 712cfe6d84bd344cfe57a1e4c791f8a4d052c76d Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 14 Aug 2024 09:09:19 +0530 Subject: [PATCH 011/157] GH-43643: [Java] LargeListViewVector IPC Integration (#43681) ### Rationale for this change Newly introduced `LargeListViewVector` requires the IPC integration for C Data integration tests while mainly supporting IPC format to include this type. ### What changes are included in this PR? Includes the `JsonFileWriter` and `JsonFileReader` along with the corresponding test cases. ### Are these changes tested? Yes, using existing tests but adding new configurations. ### Are there any user-facing changes? 
No * GitHub Issue: #43643 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- .../apache/arrow/vector/ipc/JsonFileReader.java | 9 +++++++-- .../apache/arrow/vector/ipc/JsonFileWriter.java | 16 ++++++++++++++-- .../apache/arrow/vector/ipc/TestJSONFile.java | 8 ++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index 626619a9483..5668325a87e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -73,6 +73,7 @@ import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeListView; import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.Field; @@ -729,7 +730,8 @@ private List readIntoBuffer( } else if (bufferType.equals(OFFSET) || bufferType.equals(SIZE)) { if (type == MinorType.LARGELIST || type == MinorType.LARGEVARCHAR - || type == MinorType.LARGEVARBINARY) { + || type == MinorType.LARGEVARBINARY + || type == MinorType.LARGELISTVIEW) { reader = helper.INT8; } else { reader = helper.INT4; @@ -890,7 +892,10 @@ private void readFromJsonIntoVector(Field field, FieldVector vector) throws IOEx BufferType bufferType = vectorTypes.get(v); nextFieldIs(bufferType.getName()); int innerBufferValueCount = valueCount; - if (bufferType.equals(OFFSET) && !(type instanceof Union) && !(type instanceof ListView)) { + if (bufferType.equals(OFFSET) + && !(type instanceof Union) + && !(type instanceof ListView) + && !(type instanceof LargeListView)) { /* offset buffer has 1 additional value capacity except for dense unions and ListView */ innerBufferValueCount = valueCount + 1; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java index 929c8c97c05..68700fe6afd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -73,6 +73,7 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.BaseLargeRepeatedValueViewVector; import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; @@ -232,7 +233,8 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE final int bufferValueCount = (bufferType.equals(OFFSET) && vector.getMinorType() != MinorType.DENSEUNION - && vector.getMinorType() != MinorType.LISTVIEW) + && vector.getMinorType() != MinorType.LISTVIEW + && vector.getMinorType() != MinorType.LARGELISTVIEW) ? 
valueCount + 1 : valueCount; for (int i = 0; i < bufferValueCount; i++) { @@ -274,6 +276,7 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE } else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0 && (vector.getMinorType() == MinorType.LARGELIST + || vector.getMinorType() == MinorType.LARGELISTVIEW || vector.getMinorType() == MinorType.LARGEVARBINARY || vector.getMinorType() == MinorType.LARGEVARCHAR)) { // Empty vectors may not have allocated an offsets buffer @@ -427,6 +430,10 @@ private void writeValueToGenerator( generator.writeNumber( buffer.getInt((long) index * BaseRepeatedValueViewVector.OFFSET_WIDTH)); break; + case LARGELISTVIEW: + generator.writeNumber( + buffer.getInt((long) index * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH)); + break; case LARGELIST: case LARGEVARBINARY: case LARGEVARCHAR: @@ -582,7 +589,12 @@ private void writeValueToGenerator( throw new UnsupportedOperationException("minor type: " + vector.getMinorType()); } } else if (bufferType.equals(SIZE)) { - generator.writeNumber(buffer.getInt((long) index * BaseRepeatedValueViewVector.SIZE_WIDTH)); + if (vector.getMinorType() == MinorType.LISTVIEW) { + generator.writeNumber(buffer.getInt((long) index * BaseRepeatedValueViewVector.SIZE_WIDTH)); + } else { + generator.writeNumber( + buffer.getInt((long) index * BaseLargeRepeatedValueViewVector.SIZE_WIDTH)); + } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java index c69a3bfbc1e..8037212aaea 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java @@ -437,10 +437,18 @@ public void testRoundtripEmptyVector() throws Exception { "list", FieldType.nullable(ArrowType.List.INSTANCE), Collections.singletonList(Field.nullable("items", new ArrowType.Int(32, true)))), + new Field( + "listview", + FieldType.nullable(ArrowType.ListView.INSTANCE), + Collections.singletonList(Field.nullable("items", new ArrowType.Int(32, true)))), new Field( "largelist", FieldType.nullable(ArrowType.LargeList.INSTANCE), Collections.singletonList(Field.nullable("items", new ArrowType.Int(32, true)))), + new Field( + "largelistview", + FieldType.nullable(ArrowType.LargeListView.INSTANCE), + Collections.singletonList(Field.nullable("items", new ArrowType.Int(32, true)))), new Field( "map", FieldType.nullable(new ArrowType.Map(/*keyssorted*/ false)), From 7c8909a144f2e8d593dc8ad363ac95b2865b04ca Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 14 Aug 2024 14:27:07 +0200 Subject: [PATCH 012/157] MINOR: [Dev][C++] Allow ubuntu-cpp-thread-sanitizer Docker build with Ubuntu 24.04 (#43619) Install the clang-rt libraries that are necessary to link Thread Sanitizer-enabled binaries. Also fix use of deprecated `BufferReader` constructor in some tests, so that compilation with CLang 18 succeeds. Note that the C++ test suite still fails on Flight tests, as tracked in GH-36552. 
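For reference, a minimal sketch (not part of the patch) of the `BufferReader` migration applied in the tests below; the `MakeReader` helper name is illustrative only, while `BufferReader::FromString` and the `std::shared_ptr<Buffer>` constructor are the spellings the diff itself switches to:

```cpp
#include <memory>
#include <string>
#include <utility>

#include "arrow/buffer.h"
#include "arrow/io/memory.h"

// Before (deprecated constructor whose use broke these tests under Clang 18):
//   auto reader = std::make_shared<arrow::io::BufferReader>(std::string_view("abc"));
//
// After: hand the reader an owning std::shared_ptr<Buffer>, e.g.
//   std::make_shared<arrow::io::BufferReader>(std::make_shared<arrow::Buffer>(data));
// or use the FromString convenience factory as in memory_test.cc / compressed_test.cc.
std::shared_ptr<arrow::io::BufferReader> MakeReader(std::string data) {
  // FromString moves the string into a Buffer owned by the returned reader,
  // so there is no lifetime dependency on the caller's string.
  return arrow::io::BufferReader::FromString(std::move(data));
}
```
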
Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- ci/docker/ubuntu-24.04-cpp.dockerfile | 1 + cpp/src/arrow/dataset/dataset_writer_test.cc | 2 +- cpp/src/arrow/io/compressed_test.cc | 2 +- cpp/src/arrow/io/memory_test.cc | 6 +++--- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index ecfb5e2f509..7d0772c33a2 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -57,6 +57,7 @@ RUN latest_system_llvm=18 && \ clang-${llvm} \ clang-format-${clang_tools} \ clang-tidy-${clang_tools} \ + libclang-rt-${llvm}-dev \ llvm-${llvm}-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists* diff --git a/cpp/src/arrow/dataset/dataset_writer_test.cc b/cpp/src/arrow/dataset/dataset_writer_test.cc index 871b6ef6f55..32ae8d7ee12 100644 --- a/cpp/src/arrow/dataset/dataset_writer_test.cc +++ b/cpp/src/arrow/dataset/dataset_writer_test.cc @@ -157,7 +157,7 @@ class DatasetWriterTestFixture : public testing::Test { std::shared_ptr ReadAsBatch(std::string_view data, int* num_batches) { std::shared_ptr in_stream = - std::make_shared(data); + std::make_shared(std::make_shared(data)); EXPECT_OK_AND_ASSIGN(std::shared_ptr reader, ipc::RecordBatchFileReader::Open(in_stream)); RecordBatchVector batches; diff --git a/cpp/src/arrow/io/compressed_test.cc b/cpp/src/arrow/io/compressed_test.cc index 12d116e3395..7724c65e9dd 100644 --- a/cpp/src/arrow/io/compressed_test.cc +++ b/cpp/src/arrow/io/compressed_test.cc @@ -262,7 +262,7 @@ TEST_P(CompressedOutputStreamTest, RandomData) { TEST(TestSnappyInputStream, NotImplemented) { std::unique_ptr codec; ASSERT_OK_AND_ASSIGN(codec, Codec::Create(Compression::SNAPPY)); - std::shared_ptr stream = std::make_shared(""); + std::shared_ptr stream = BufferReader::FromString(""); ASSERT_RAISES(NotImplemented, CompressedInputStream::Make(codec.get(), stream)); } diff --git a/cpp/src/arrow/io/memory_test.cc b/cpp/src/arrow/io/memory_test.cc index bd898f17181..58f51ffa8d0 100644 --- a/cpp/src/arrow/io/memory_test.cc +++ b/cpp/src/arrow/io/memory_test.cc @@ -404,7 +404,7 @@ template void TestSlowInputStream() { using clock = std::chrono::high_resolution_clock; - auto stream = std::make_shared(std::string_view("abcdefghijkl")); + std::shared_ptr stream = BufferReader::FromString("abcdefghijkl"); const double latency = 0.6; auto slow = std::make_shared(stream, latency); @@ -519,7 +519,7 @@ class TestTransformInputStream : public ::testing::Test { TransformInputStream::TransformFunc transform() const { return T(); } void TestEmptyStream() { - auto wrapped = std::make_shared(std::string_view()); + std::shared_ptr wrapped = BufferReader::FromString({}); auto stream = std::make_shared(wrapped, transform()); ASSERT_OK_AND_EQ(0, stream->Tell()); @@ -797,7 +797,7 @@ TEST(RangeReadCache, Basics) { TEST(RangeReadCache, Concurrency) { std::string data = "abcdefghijklmnopqrstuvwxyz"; - auto file = std::make_shared(Buffer(data)); + auto file = std::make_shared(std::make_shared(data)); std::vector ranges{{1, 2}, {3, 2}, {8, 2}, {20, 2}, {25, 0}, {10, 4}, {14, 0}, {15, 4}}; From ab432b1362208696e60824b45a5599a4e91e6301 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 14 Aug 2024 07:50:04 -0700 Subject: [PATCH 013/157] GH-43627: [R] Fix summarize() performance regression (pushdown) (#43649) ### Rationale for this change See https://github.com/apache/arrow/issues/43627#issuecomment-2284259559 ### What changes are included in this PR? 
An extra `dplyr::select()` ### Are these changes tested? Conbench should show that the performance is much better ### Are there any user-facing changes? Not slow * GitHub Issue: #43627 --- r/R/dplyr-summarize.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index f4fda0f13aa..a9ad750de7c 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -43,6 +43,15 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { hash = length(.data$group_by_vars) > 0 ) + # Do a projection here to keep only the columns we need in summarize(). + # If possible, this will push down the column selection into the SourceNode, + # saving lots of wasted processing for columns we don't need. (GH-43627) + vars_to_keep <- unique(c( + unlist(lapply(exprs, all.vars)), # vars referenced in summarize + dplyr::group_vars(.data) # vars needed for grouping + )) + .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data))) + # nolint start # summarize() is complicated because you can do a mixture of scalar operations # and aggregations, but that's not how Acero works. For example, for us to do From f518d6beb0c70f00688d08a3e70deff0d3c24c86 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 15 Aug 2024 10:41:08 +0200 Subject: [PATCH 014/157] GH-38041: [C++][CI] Improve IPC fuzzing seed corpus (#43621) 1. Add fuzz seeds with newer datatypes such as Run-End Encoded and String Views 2. Add fuzz seeds with buffer compression 3. Build seed corpus generation utilities even when fuzzing isn't enabled, for convenience * GitHub Issue: #38041 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/ipc/CMakeLists.txt | 7 ++- cpp/src/arrow/ipc/generate_fuzz_corpus.cc | 44 +++++++++++++------ .../arrow/ipc/generate_tensor_fuzz_corpus.cc | 2 +- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 2fc9b145ccc..9e0b1d723b9 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -71,7 +71,12 @@ endif() add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc") -if(ARROW_FUZZING) +if(ARROW_FUZZING + OR (ARROW_BUILD_UTILITIES + AND ARROW_TESTING + AND ARROW_WITH_LZ4 + AND ARROW_WITH_ZSTD + )) add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc) target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB} ${ARROW_TEST_LINK_LIBS}) diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc index 682c352132a..6ccf1155d12 100644 --- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc @@ -33,11 +33,11 @@ #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/testing/extension_type.h" +#include "arrow/util/compression.h" #include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" -namespace arrow { -namespace ipc { +namespace arrow::ipc { using ::arrow::internal::CreateDir; using ::arrow::internal::PlatformFilename; @@ -88,6 +88,13 @@ Result>> Batches() { batches.push_back(batch); RETURN_NOT_OK(test::MakeFixedSizeListRecordBatch(&batch)); batches.push_back(batch); + RETURN_NOT_OK(test::MakeStringTypesRecordBatch(&batch)); + batches.push_back(batch); + RETURN_NOT_OK(test::MakeUuid(&batch)); + batches.push_back(batch); + RETURN_NOT_OK(test::MakeRunEndEncoded(&batch)); + batches.push_back(batch); + ARROW_ASSIGN_OR_RAISE(batch, MakeExtensionBatch()); batches.push_back(batch); 
ARROW_ASSIGN_OR_RAISE(batch, MakeMapBatch()); @@ -97,13 +104,14 @@ Result>> Batches() { } Result> SerializeRecordBatch( - const std::shared_ptr& batch, bool is_stream_format) { + const std::shared_ptr& batch, const IpcWriteOptions& options, + bool is_stream_format) { ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create(1024)); std::shared_ptr writer; if (is_stream_format) { - ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema())); + ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema(), options)); } else { - ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema())); + ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema(), options)); } RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); RETURN_NOT_OK(writer->Close()); @@ -119,16 +127,27 @@ Status DoMain(bool is_stream_format, const std::string& out_dir) { return "batch-" + std::to_string(sample_num++); }; + // codec 0 is uncompressed + std::vector> codecs(3, nullptr); + ARROW_ASSIGN_OR_RAISE(codecs[1], util::Codec::Create(Compression::LZ4_FRAME)); + ARROW_ASSIGN_OR_RAISE(codecs[2], util::Codec::Create(Compression::ZSTD)); + ARROW_ASSIGN_OR_RAISE(auto batches, Batches()); + // Emit a separate file for each (batch, codec) pair for (const auto& batch : batches) { RETURN_NOT_OK(batch->ValidateFull()); - ARROW_ASSIGN_OR_RAISE(auto buf, SerializeRecordBatch(batch, is_stream_format)); - ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name())); - std::cerr << sample_fn.ToString() << std::endl; - ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString())); - RETURN_NOT_OK(file->Write(buf)); - RETURN_NOT_OK(file->Close()); + for (const auto& codec : codecs) { + IpcWriteOptions options = IpcWriteOptions::Defaults(); + options.codec = codec; + ARROW_ASSIGN_OR_RAISE(auto buf, + SerializeRecordBatch(batch, options, is_stream_format)); + ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name())); + std::cerr << sample_fn.ToString() << std::endl; + ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString())); + RETURN_NOT_OK(file->Write(buf)); + RETURN_NOT_OK(file->Close()); + } } return Status::OK(); } @@ -157,7 +176,6 @@ int Main(int argc, char** argv) { return 0; } -} // namespace ipc -} // namespace arrow +} // namespace arrow::ipc int main(int argc, char** argv) { return arrow::ipc::Main(argc, argv); } diff --git a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc index dd40ef0ab2f..870f4586708 100644 --- a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc @@ -41,7 +41,7 @@ using ::arrow::internal::PlatformFilename; Result PrepareDirectory(const std::string& dir) { ARROW_ASSIGN_OR_RAISE(auto dir_fn, PlatformFilename::FromString(dir)); RETURN_NOT_OK(::arrow::internal::CreateDir(dir_fn)); - return std::move(dir_fn); + return dir_fn; } Result> MakeSerializedBuffer( From 894f72f735c7074a40908bbc4d04bc4d07cbc3ea Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 15 Aug 2024 14:24:35 +0200 Subject: [PATCH 015/157] GH-43536: [Python] Do not use borrowed references APIs (#43540) ### Rationale for this change For better reference safety under Python free-threaded builds (i.e. with the GIL removed), we should be using `Py(List|Dict)_GetItemRef` that return strong references and are implemented in a thread-safe manner. ### What changes are included in this PR? 
- Vendor a copy of https://github.com/python/pythoncapi-compat - Port to strong reference APIs for lists and dicts ### Are these changes tested? I ran the tests with the free-threaded build before and after, and there's the same expected failures. * GitHub Issue: #43536 Lead-authored-by: Lysandros Nikolaou Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- dev/release/rat_exclude_files.txt | 1 + .../pyarrow/src/arrow/python/CMakeLists.txt | 1 + .../pyarrow/src/arrow/python/deserialize.cc | 14 +- .../src/arrow/python/numpy_to_arrow.cc | 7 +- .../src/arrow/python/python_to_arrow.cc | 17 +- .../src/arrow/python/vendored/CMakeLists.txt | 18 + .../arrow/python/vendored/pythoncapi_compat.h | 1519 +++++++++++++++++ 7 files changed, 1566 insertions(+), 11 deletions(-) create mode 100644 python/pyarrow/src/arrow/python/vendored/CMakeLists.txt create mode 100644 python/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index ef325090f2f..e149c179813 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -95,6 +95,7 @@ python/manylinux1/.dockerignore python/pyarrow/includes/__init__.pxd python/pyarrow/tests/__init__.py python/pyarrow/vendored/* +python/pyarrow/src/arrow/python/vendored/* python/requirements*.txt pax_global_header MANIFEST.in diff --git a/python/pyarrow/src/arrow/python/CMakeLists.txt b/python/pyarrow/src/arrow/python/CMakeLists.txt index ff355e46a4b..67508982eab 100644 --- a/python/pyarrow/src/arrow/python/CMakeLists.txt +++ b/python/pyarrow/src/arrow/python/CMakeLists.txt @@ -16,3 +16,4 @@ # under the License. arrow_install_all_headers("arrow/python") +add_subdirectory(vendored) diff --git a/python/pyarrow/src/arrow/python/deserialize.cc b/python/pyarrow/src/arrow/python/deserialize.cc index 961a1686e0a..ab300a182fa 100644 --- a/python/pyarrow/src/arrow/python/deserialize.cc +++ b/python/pyarrow/src/arrow/python/deserialize.cc @@ -46,6 +46,7 @@ #include "arrow/python/numpy_convert.h" #include "arrow/python/pyarrow.h" #include "arrow/python/serialize.h" +#include "arrow/python/vendored/pythoncapi_compat.h" namespace arrow { @@ -88,8 +89,13 @@ Status DeserializeDict(PyObject* context, const Array& array, int64_t start_idx, // The latter two steal references whereas PyDict_SetItem does not. So we need // to make sure the reference count is decremented by letting the OwnedRef // go out of scope at the end. 
- int ret = PyDict_SetItem(result.obj(), PyList_GET_ITEM(keys.obj(), i - start_idx), - PyList_GET_ITEM(vals.obj(), i - start_idx)); + PyObject* key = PyList_GetItemRef(keys.obj(), i - start_idx); + RETURN_IF_PYERROR(); + OwnedRef keyref(key); + PyObject* val = PyList_GetItemRef(vals.obj(), i - start_idx); + RETURN_IF_PYERROR(); + OwnedRef valref(val); + int ret = PyDict_SetItem(result.obj(), key, val); if (ret != 0) { return ConvertPyError(); } @@ -398,7 +404,9 @@ Status GetSerializedFromComponents(int num_tensors, auto GetBuffer = [&data](Py_ssize_t index, std::shared_ptr* out) { ARROW_CHECK_LE(index, PyList_Size(data)); - PyObject* py_buf = PyList_GET_ITEM(data, index); + PyObject* py_buf = PyList_GetItemRef(data, index); + RETURN_IF_PYERROR(); + OwnedRef py_buf_ref(py_buf); return unwrap_buffer(py_buf).Value(out); }; diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 460b1d0ce3f..e78a301bce3 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -57,6 +57,7 @@ #include "arrow/python/numpy_internal.h" #include "arrow/python/python_to_arrow.h" #include "arrow/python/type_traits.h" +#include "arrow/python/vendored/pythoncapi_compat.h" namespace arrow { @@ -757,8 +758,10 @@ Status NumPyConverter::Visit(const StructType& type) { } for (auto field : type.fields()) { - PyObject* tup = - PyDict_GetItemString(PyDataType_FIELDS(dtype_), field->name().c_str()); + PyObject* tup; + PyDict_GetItemStringRef(PyDataType_FIELDS(dtype_), field->name().c_str(), &tup); + RETURN_IF_PYERROR(); + OwnedRef tupref(tup); if (tup == NULL) { return Status::Invalid("Missing field '", field->name(), "' in struct array"); } diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index a2a325fde8d..ce9e15c894c 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -54,6 +54,7 @@ #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/type_traits.h" +#include "arrow/python/vendored/pythoncapi_compat.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -1107,11 +1108,13 @@ class PyStructConverter : public StructConverter Status AppendDict(PyObject* dict, PyObject* field_names) { // NOTE we're ignoring any extraneous dict items for (int i = 0; i < num_fields_; i++) { - PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed - PyObject* value = PyDict_GetItem(dict, name); // borrowed - if (value == NULL) { - RETURN_IF_PYERROR(); - } + PyObject* name = PyList_GetItemRef(field_names, i); + RETURN_IF_PYERROR(); + OwnedRef nameref(name); + PyObject* value; + PyDict_GetItemRef(dict, name, &value); + RETURN_IF_PYERROR(); + OwnedRef valueref(value); RETURN_NOT_OK(this->children_[i]->Append(value ? 
value : Py_None)); } return Status::OK(); @@ -1141,7 +1144,9 @@ class PyStructConverter : public StructConverter ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); // validate that the key and the field name are equal - PyObject* name = PyList_GET_ITEM(field_names, i); + PyObject* name = PyList_GetItemRef(field_names, i); + RETURN_IF_PYERROR(); + OwnedRef nameref(name); bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ); RETURN_IF_PYERROR(); diff --git a/python/pyarrow/src/arrow/python/vendored/CMakeLists.txt b/python/pyarrow/src/arrow/python/vendored/CMakeLists.txt new file mode 100644 index 00000000000..6190072c0d3 --- /dev/null +++ b/python/pyarrow/src/arrow/python/vendored/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +arrow_install_all_headers("arrow/python/vendored") diff --git a/python/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h b/python/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h new file mode 100644 index 00000000000..4baa7b34a93 --- /dev/null +++ b/python/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h @@ -0,0 +1,1519 @@ +// Header file providing new C API functions to old Python versions. +// +// File distributed under the Zero Clause BSD (0BSD) license. +// Copyright Contributors to the pythoncapi_compat project. +// +// Homepage: +// https://github.com/python/pythoncapi_compat +// +// Latest version: +// https://raw.githubusercontent.com/python/pythoncapi_compat/master/pythoncapi_compat.h +// +// Vendored from git revision: +// 39e2663e6acc0b68d5dd75bdaad0af33152552ae +// https://raw.githubusercontent.com/python/pythoncapi-compat/39e2663e6acc0b68d5dd75bdaad0af33152552ae/pythoncapi_compat.h +// +// SPDX-License-Identifier: 0BSD + +/* clang-format off */ + +#ifndef PYTHONCAPI_COMPAT +#define PYTHONCAPI_COMPAT + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +// Python 3.11.0b4 added PyFrame_Back() to Python.h +#if PY_VERSION_HEX < 0x030b00B4 && !defined(PYPY_VERSION) +# include "frameobject.h" // PyFrameObject, PyFrame_GetBack() +#endif + + +#ifndef _Py_CAST +# define _Py_CAST(type, expr) ((type)(expr)) +#endif + +// Static inline functions should use _Py_NULL rather than using directly NULL +// to prevent C++ compiler warnings. On C23 and newer and on C++11 and newer, +// _Py_NULL is defined as nullptr. +#if (defined (__STDC_VERSION__) && __STDC_VERSION__ > 201710L) \ + || (defined(__cplusplus) && __cplusplus >= 201103) +# define _Py_NULL nullptr +#else +# define _Py_NULL NULL +#endif + +// Cast argument to PyObject* type. 
+#ifndef _PyObject_CAST +# define _PyObject_CAST(op) _Py_CAST(PyObject*, op) +#endif + + +// bpo-42262 added Py_NewRef() to Python 3.10.0a3 +#if PY_VERSION_HEX < 0x030A00A3 && !defined(Py_NewRef) +static inline PyObject* _Py_NewRef(PyObject *obj) +{ + Py_INCREF(obj); + return obj; +} +#define Py_NewRef(obj) _Py_NewRef(_PyObject_CAST(obj)) +#endif + + +// bpo-42262 added Py_XNewRef() to Python 3.10.0a3 +#if PY_VERSION_HEX < 0x030A00A3 && !defined(Py_XNewRef) +static inline PyObject* _Py_XNewRef(PyObject *obj) +{ + Py_XINCREF(obj); + return obj; +} +#define Py_XNewRef(obj) _Py_XNewRef(_PyObject_CAST(obj)) +#endif + + +// bpo-39573 added Py_SET_REFCNT() to Python 3.9.0a4 +#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_REFCNT) +static inline void _Py_SET_REFCNT(PyObject *ob, Py_ssize_t refcnt) +{ + ob->ob_refcnt = refcnt; +} +#define Py_SET_REFCNT(ob, refcnt) _Py_SET_REFCNT(_PyObject_CAST(ob), refcnt) +#endif + + +// Py_SETREF() and Py_XSETREF() were added to Python 3.5.2. +// It is excluded from the limited C API. +#if (PY_VERSION_HEX < 0x03050200 && !defined(Py_SETREF)) && !defined(Py_LIMITED_API) +#define Py_SETREF(dst, src) \ + do { \ + PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \ + PyObject *_tmp_dst = (*_tmp_dst_ptr); \ + *_tmp_dst_ptr = _PyObject_CAST(src); \ + Py_DECREF(_tmp_dst); \ + } while (0) + +#define Py_XSETREF(dst, src) \ + do { \ + PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \ + PyObject *_tmp_dst = (*_tmp_dst_ptr); \ + *_tmp_dst_ptr = _PyObject_CAST(src); \ + Py_XDECREF(_tmp_dst); \ + } while (0) +#endif + + +// bpo-43753 added Py_Is(), Py_IsNone(), Py_IsTrue() and Py_IsFalse() +// to Python 3.10.0b1. +#if PY_VERSION_HEX < 0x030A00B1 && !defined(Py_Is) +# define Py_Is(x, y) ((x) == (y)) +#endif +#if PY_VERSION_HEX < 0x030A00B1 && !defined(Py_IsNone) +# define Py_IsNone(x) Py_Is(x, Py_None) +#endif +#if (PY_VERSION_HEX < 0x030A00B1 || defined(PYPY_VERSION)) && !defined(Py_IsTrue) +# define Py_IsTrue(x) Py_Is(x, Py_True) +#endif +#if (PY_VERSION_HEX < 0x030A00B1 || defined(PYPY_VERSION)) && !defined(Py_IsFalse) +# define Py_IsFalse(x) Py_Is(x, Py_False) +#endif + + +// bpo-39573 added Py_SET_TYPE() to Python 3.9.0a4 +#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_TYPE) +static inline void _Py_SET_TYPE(PyObject *ob, PyTypeObject *type) +{ + ob->ob_type = type; +} +#define Py_SET_TYPE(ob, type) _Py_SET_TYPE(_PyObject_CAST(ob), type) +#endif + + +// bpo-39573 added Py_SET_SIZE() to Python 3.9.0a4 +#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_SIZE) +static inline void _Py_SET_SIZE(PyVarObject *ob, Py_ssize_t size) +{ + ob->ob_size = size; +} +#define Py_SET_SIZE(ob, size) _Py_SET_SIZE((PyVarObject*)(ob), size) +#endif + + +// bpo-40421 added PyFrame_GetCode() to Python 3.9.0b1 +#if PY_VERSION_HEX < 0x030900B1 || defined(PYPY_VERSION) +static inline PyCodeObject* PyFrame_GetCode(PyFrameObject *frame) +{ + assert(frame != _Py_NULL); + assert(frame->f_code != _Py_NULL); + return _Py_CAST(PyCodeObject*, Py_NewRef(frame->f_code)); +} +#endif + +static inline PyCodeObject* _PyFrame_GetCodeBorrow(PyFrameObject *frame) +{ + PyCodeObject *code = PyFrame_GetCode(frame); + Py_DECREF(code); + return code; +} + + +// bpo-40421 added PyFrame_GetBack() to Python 3.9.0b1 +#if PY_VERSION_HEX < 0x030900B1 && !defined(PYPY_VERSION) +static inline PyFrameObject* PyFrame_GetBack(PyFrameObject *frame) +{ + assert(frame != _Py_NULL); + return _Py_CAST(PyFrameObject*, Py_XNewRef(frame->f_back)); +} +#endif + +#if !defined(PYPY_VERSION) +static inline PyFrameObject* 
_PyFrame_GetBackBorrow(PyFrameObject *frame) +{ + PyFrameObject *back = PyFrame_GetBack(frame); + Py_XDECREF(back); + return back; +} +#endif + + +// bpo-40421 added PyFrame_GetLocals() to Python 3.11.0a7 +#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION) +static inline PyObject* PyFrame_GetLocals(PyFrameObject *frame) +{ +#if PY_VERSION_HEX >= 0x030400B1 + if (PyFrame_FastToLocalsWithError(frame) < 0) { + return NULL; + } +#else + PyFrame_FastToLocals(frame); +#endif + return Py_NewRef(frame->f_locals); +} +#endif + + +// bpo-40421 added PyFrame_GetGlobals() to Python 3.11.0a7 +#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION) +static inline PyObject* PyFrame_GetGlobals(PyFrameObject *frame) +{ + return Py_NewRef(frame->f_globals); +} +#endif + + +// bpo-40421 added PyFrame_GetBuiltins() to Python 3.11.0a7 +#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION) +static inline PyObject* PyFrame_GetBuiltins(PyFrameObject *frame) +{ + return Py_NewRef(frame->f_builtins); +} +#endif + + +// bpo-40421 added PyFrame_GetLasti() to Python 3.11.0b1 +#if PY_VERSION_HEX < 0x030B00B1 && !defined(PYPY_VERSION) +static inline int PyFrame_GetLasti(PyFrameObject *frame) +{ +#if PY_VERSION_HEX >= 0x030A00A7 + // bpo-27129: Since Python 3.10.0a7, f_lasti is an instruction offset, + // not a bytes offset anymore. Python uses 16-bit "wordcode" (2 bytes) + // instructions. + if (frame->f_lasti < 0) { + return -1; + } + return frame->f_lasti * 2; +#else + return frame->f_lasti; +#endif +} +#endif + + +// gh-91248 added PyFrame_GetVar() to Python 3.12.0a2 +#if PY_VERSION_HEX < 0x030C00A2 && !defined(PYPY_VERSION) +static inline PyObject* PyFrame_GetVar(PyFrameObject *frame, PyObject *name) +{ + PyObject *locals, *value; + + locals = PyFrame_GetLocals(frame); + if (locals == NULL) { + return NULL; + } +#if PY_VERSION_HEX >= 0x03000000 + value = PyDict_GetItemWithError(locals, name); +#else + value = _PyDict_GetItemWithError(locals, name); +#endif + Py_DECREF(locals); + + if (value == NULL) { + if (PyErr_Occurred()) { + return NULL; + } +#if PY_VERSION_HEX >= 0x03000000 + PyErr_Format(PyExc_NameError, "variable %R does not exist", name); +#else + PyErr_SetString(PyExc_NameError, "variable does not exist"); +#endif + return NULL; + } + return Py_NewRef(value); +} +#endif + + +// gh-91248 added PyFrame_GetVarString() to Python 3.12.0a2 +#if PY_VERSION_HEX < 0x030C00A2 && !defined(PYPY_VERSION) +static inline PyObject* +PyFrame_GetVarString(PyFrameObject *frame, const char *name) +{ + PyObject *name_obj, *value; +#if PY_VERSION_HEX >= 0x03000000 + name_obj = PyUnicode_FromString(name); +#else + name_obj = PyString_FromString(name); +#endif + if (name_obj == NULL) { + return NULL; + } + value = PyFrame_GetVar(frame, name_obj); + Py_DECREF(name_obj); + return value; +} +#endif + + +// bpo-39947 added PyThreadState_GetInterpreter() to Python 3.9.0a5 +#if PY_VERSION_HEX < 0x030900A5 || defined(PYPY_VERSION) +static inline PyInterpreterState * +PyThreadState_GetInterpreter(PyThreadState *tstate) +{ + assert(tstate != _Py_NULL); + return tstate->interp; +} +#endif + + +// bpo-40429 added PyThreadState_GetFrame() to Python 3.9.0b1 +#if PY_VERSION_HEX < 0x030900B1 && !defined(PYPY_VERSION) +static inline PyFrameObject* PyThreadState_GetFrame(PyThreadState *tstate) +{ + assert(tstate != _Py_NULL); + return _Py_CAST(PyFrameObject *, Py_XNewRef(tstate->frame)); +} +#endif + +#if !defined(PYPY_VERSION) +static inline PyFrameObject* +_PyThreadState_GetFrameBorrow(PyThreadState *tstate) +{ + PyFrameObject 
*frame = PyThreadState_GetFrame(tstate); + Py_XDECREF(frame); + return frame; +} +#endif + + +// bpo-39947 added PyInterpreterState_Get() to Python 3.9.0a5 +#if PY_VERSION_HEX < 0x030900A5 || defined(PYPY_VERSION) +static inline PyInterpreterState* PyInterpreterState_Get(void) +{ + PyThreadState *tstate; + PyInterpreterState *interp; + + tstate = PyThreadState_GET(); + if (tstate == _Py_NULL) { + Py_FatalError("GIL released (tstate is NULL)"); + } + interp = tstate->interp; + if (interp == _Py_NULL) { + Py_FatalError("no current interpreter"); + } + return interp; +} +#endif + + +// bpo-39947 added PyInterpreterState_Get() to Python 3.9.0a6 +#if 0x030700A1 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x030900A6 && !defined(PYPY_VERSION) +static inline uint64_t PyThreadState_GetID(PyThreadState *tstate) +{ + assert(tstate != _Py_NULL); + return tstate->id; +} +#endif + +// bpo-43760 added PyThreadState_EnterTracing() to Python 3.11.0a2 +#if PY_VERSION_HEX < 0x030B00A2 && !defined(PYPY_VERSION) +static inline void PyThreadState_EnterTracing(PyThreadState *tstate) +{ + tstate->tracing++; +#if PY_VERSION_HEX >= 0x030A00A1 + tstate->cframe->use_tracing = 0; +#else + tstate->use_tracing = 0; +#endif +} +#endif + +// bpo-43760 added PyThreadState_LeaveTracing() to Python 3.11.0a2 +#if PY_VERSION_HEX < 0x030B00A2 && !defined(PYPY_VERSION) +static inline void PyThreadState_LeaveTracing(PyThreadState *tstate) +{ + int use_tracing = (tstate->c_tracefunc != _Py_NULL + || tstate->c_profilefunc != _Py_NULL); + tstate->tracing--; +#if PY_VERSION_HEX >= 0x030A00A1 + tstate->cframe->use_tracing = use_tracing; +#else + tstate->use_tracing = use_tracing; +#endif +} +#endif + + +// bpo-37194 added PyObject_CallNoArgs() to Python 3.9.0a1 +// PyObject_CallNoArgs() added to PyPy 3.9.16-v7.3.11 +#if !defined(PyObject_CallNoArgs) && PY_VERSION_HEX < 0x030900A1 +static inline PyObject* PyObject_CallNoArgs(PyObject *func) +{ + return PyObject_CallFunctionObjArgs(func, NULL); +} +#endif + + +// bpo-39245 made PyObject_CallOneArg() public (previously called +// _PyObject_CallOneArg) in Python 3.9.0a4 +// PyObject_CallOneArg() added to PyPy 3.9.16-v7.3.11 +#if !defined(PyObject_CallOneArg) && PY_VERSION_HEX < 0x030900A4 +static inline PyObject* PyObject_CallOneArg(PyObject *func, PyObject *arg) +{ + return PyObject_CallFunctionObjArgs(func, arg, NULL); +} +#endif + + +// bpo-1635741 added PyModule_AddObjectRef() to Python 3.10.0a3 +#if PY_VERSION_HEX < 0x030A00A3 +static inline int +PyModule_AddObjectRef(PyObject *module, const char *name, PyObject *value) +{ + int res; + + if (!value && !PyErr_Occurred()) { + // PyModule_AddObject() raises TypeError in this case + PyErr_SetString(PyExc_SystemError, + "PyModule_AddObjectRef() must be called " + "with an exception raised if value is NULL"); + return -1; + } + + Py_XINCREF(value); + res = PyModule_AddObject(module, name, value); + if (res < 0) { + Py_XDECREF(value); + } + return res; +} +#endif + + +// bpo-40024 added PyModule_AddType() to Python 3.9.0a5 +#if PY_VERSION_HEX < 0x030900A5 +static inline int PyModule_AddType(PyObject *module, PyTypeObject *type) +{ + const char *name, *dot; + + if (PyType_Ready(type) < 0) { + return -1; + } + + // inline _PyType_Name() + name = type->tp_name; + assert(name != _Py_NULL); + dot = strrchr(name, '.'); + if (dot != _Py_NULL) { + name = dot + 1; + } + + return PyModule_AddObjectRef(module, name, _PyObject_CAST(type)); +} +#endif + + +// bpo-40241 added PyObject_GC_IsTracked() to Python 3.9.0a6. 
+// bpo-4688 added _PyObject_GC_IS_TRACKED() to Python 2.7.0a2. +#if PY_VERSION_HEX < 0x030900A6 && !defined(PYPY_VERSION) +static inline int PyObject_GC_IsTracked(PyObject* obj) +{ + return (PyObject_IS_GC(obj) && _PyObject_GC_IS_TRACKED(obj)); +} +#endif + +// bpo-40241 added PyObject_GC_IsFinalized() to Python 3.9.0a6. +// bpo-18112 added _PyGCHead_FINALIZED() to Python 3.4.0 final. +#if PY_VERSION_HEX < 0x030900A6 && PY_VERSION_HEX >= 0x030400F0 && !defined(PYPY_VERSION) +static inline int PyObject_GC_IsFinalized(PyObject *obj) +{ + PyGC_Head *gc = _Py_CAST(PyGC_Head*, obj) - 1; + return (PyObject_IS_GC(obj) && _PyGCHead_FINALIZED(gc)); +} +#endif + + +// bpo-39573 added Py_IS_TYPE() to Python 3.9.0a4 +#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_IS_TYPE) +static inline int _Py_IS_TYPE(PyObject *ob, PyTypeObject *type) { + return Py_TYPE(ob) == type; +} +#define Py_IS_TYPE(ob, type) _Py_IS_TYPE(_PyObject_CAST(ob), type) +#endif + + +// bpo-46906 added PyFloat_Pack2() and PyFloat_Unpack2() to Python 3.11a7. +// bpo-11734 added _PyFloat_Pack2() and _PyFloat_Unpack2() to Python 3.6.0b1. +// Python 3.11a2 moved _PyFloat_Pack2() and _PyFloat_Unpack2() to the internal +// C API: Python 3.11a2-3.11a6 versions are not supported. +#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION) +static inline int PyFloat_Pack2(double x, char *p, int le) +{ return _PyFloat_Pack2(x, (unsigned char*)p, le); } + +static inline double PyFloat_Unpack2(const char *p, int le) +{ return _PyFloat_Unpack2((const unsigned char *)p, le); } +#endif + + +// bpo-46906 added PyFloat_Pack4(), PyFloat_Pack8(), PyFloat_Unpack4() and +// PyFloat_Unpack8() to Python 3.11a7. +// Python 3.11a2 moved _PyFloat_Pack4(), _PyFloat_Pack8(), _PyFloat_Unpack4() +// and _PyFloat_Unpack8() to the internal C API: Python 3.11a2-3.11a6 versions +// are not supported. +#if PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION) +static inline int PyFloat_Pack4(double x, char *p, int le) +{ return _PyFloat_Pack4(x, (unsigned char*)p, le); } + +static inline int PyFloat_Pack8(double x, char *p, int le) +{ return _PyFloat_Pack8(x, (unsigned char*)p, le); } + +static inline double PyFloat_Unpack4(const char *p, int le) +{ return _PyFloat_Unpack4((const unsigned char *)p, le); } + +static inline double PyFloat_Unpack8(const char *p, int le) +{ return _PyFloat_Unpack8((const unsigned char *)p, le); } +#endif + + +// gh-92154 added PyCode_GetCode() to Python 3.11.0b1 +#if PY_VERSION_HEX < 0x030B00B1 && !defined(PYPY_VERSION) +static inline PyObject* PyCode_GetCode(PyCodeObject *code) +{ + return Py_NewRef(code->co_code); +} +#endif + + +// gh-95008 added PyCode_GetVarnames() to Python 3.11.0rc1 +#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION) +static inline PyObject* PyCode_GetVarnames(PyCodeObject *code) +{ + return Py_NewRef(code->co_varnames); +} +#endif + +// gh-95008 added PyCode_GetFreevars() to Python 3.11.0rc1 +#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION) +static inline PyObject* PyCode_GetFreevars(PyCodeObject *code) +{ + return Py_NewRef(code->co_freevars); +} +#endif + +// gh-95008 added PyCode_GetCellvars() to Python 3.11.0rc1 +#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION) +static inline PyObject* PyCode_GetCellvars(PyCodeObject *code) +{ + return Py_NewRef(code->co_cellvars); +} +#endif + + +// Py_UNUSED() was added to Python 3.4.0b2. 
+#if PY_VERSION_HEX < 0x030400B2 && !defined(Py_UNUSED) +# if defined(__GNUC__) || defined(__clang__) +# define Py_UNUSED(name) _unused_ ## name __attribute__((unused)) +# else +# define Py_UNUSED(name) _unused_ ## name +# endif +#endif + + +// gh-105922 added PyImport_AddModuleRef() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A0 +static inline PyObject* PyImport_AddModuleRef(const char *name) +{ + return Py_XNewRef(PyImport_AddModule(name)); +} +#endif + + +// gh-105927 added PyWeakref_GetRef() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D0000 +static inline int PyWeakref_GetRef(PyObject *ref, PyObject **pobj) +{ + PyObject *obj; + if (ref != NULL && !PyWeakref_Check(ref)) { + *pobj = NULL; + PyErr_SetString(PyExc_TypeError, "expected a weakref"); + return -1; + } + obj = PyWeakref_GetObject(ref); + if (obj == NULL) { + // SystemError if ref is NULL + *pobj = NULL; + return -1; + } + if (obj == Py_None) { + *pobj = NULL; + return 0; + } + *pobj = Py_NewRef(obj); + return (*pobj != NULL); +} +#endif + + +// bpo-36974 added PY_VECTORCALL_ARGUMENTS_OFFSET to Python 3.8b1 +#ifndef PY_VECTORCALL_ARGUMENTS_OFFSET +# define PY_VECTORCALL_ARGUMENTS_OFFSET (_Py_CAST(size_t, 1) << (8 * sizeof(size_t) - 1)) +#endif + +// bpo-36974 added PyVectorcall_NARGS() to Python 3.8b1 +#if PY_VERSION_HEX < 0x030800B1 +static inline Py_ssize_t PyVectorcall_NARGS(size_t n) +{ + return n & ~PY_VECTORCALL_ARGUMENTS_OFFSET; +} +#endif + + +// gh-105922 added PyObject_Vectorcall() to Python 3.9.0a4 +#if PY_VERSION_HEX < 0x030900A4 +static inline PyObject* +PyObject_Vectorcall(PyObject *callable, PyObject *const *args, + size_t nargsf, PyObject *kwnames) +{ +#if PY_VERSION_HEX >= 0x030800B1 && !defined(PYPY_VERSION) + // bpo-36974 added _PyObject_Vectorcall() to Python 3.8.0b1 + return _PyObject_Vectorcall(callable, args, nargsf, kwnames); +#else + PyObject *posargs = NULL, *kwargs = NULL; + PyObject *res; + Py_ssize_t nposargs, nkwargs, i; + + if (nargsf != 0 && args == NULL) { + PyErr_BadInternalCall(); + goto error; + } + if (kwnames != NULL && !PyTuple_Check(kwnames)) { + PyErr_BadInternalCall(); + goto error; + } + + nposargs = (Py_ssize_t)PyVectorcall_NARGS(nargsf); + if (kwnames) { + nkwargs = PyTuple_GET_SIZE(kwnames); + } + else { + nkwargs = 0; + } + + posargs = PyTuple_New(nposargs); + if (posargs == NULL) { + goto error; + } + if (nposargs) { + for (i=0; i < nposargs; i++) { + PyTuple_SET_ITEM(posargs, i, Py_NewRef(*args)); + args++; + } + } + + if (nkwargs) { + kwargs = PyDict_New(); + if (kwargs == NULL) { + goto error; + } + + for (i = 0; i < nkwargs; i++) { + PyObject *key = PyTuple_GET_ITEM(kwnames, i); + PyObject *value = *args; + args++; + if (PyDict_SetItem(kwargs, key, value) < 0) { + goto error; + } + } + } + else { + kwargs = NULL; + } + + res = PyObject_Call(callable, posargs, kwargs); + Py_DECREF(posargs); + Py_XDECREF(kwargs); + return res; + +error: + Py_DECREF(posargs); + Py_XDECREF(kwargs); + return NULL; +#endif +} +#endif + + +// gh-106521 added PyObject_GetOptionalAttr() and +// PyObject_GetOptionalAttrString() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int +PyObject_GetOptionalAttr(PyObject *obj, PyObject *attr_name, PyObject **result) +{ + // bpo-32571 added _PyObject_LookupAttr() to Python 3.7.0b1 +#if PY_VERSION_HEX >= 0x030700B1 && !defined(PYPY_VERSION) + return _PyObject_LookupAttr(obj, attr_name, result); +#else + *result = PyObject_GetAttr(obj, attr_name); + if (*result != NULL) { + return 1; + } + if (!PyErr_Occurred()) { + return 0; + } + if 
(PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + return 0; + } + return -1; +#endif +} + +static inline int +PyObject_GetOptionalAttrString(PyObject *obj, const char *attr_name, PyObject **result) +{ + PyObject *name_obj; + int rc; +#if PY_VERSION_HEX >= 0x03000000 + name_obj = PyUnicode_FromString(attr_name); +#else + name_obj = PyString_FromString(attr_name); +#endif + if (name_obj == NULL) { + *result = NULL; + return -1; + } + rc = PyObject_GetOptionalAttr(obj, name_obj, result); + Py_DECREF(name_obj); + return rc; +} +#endif + + +// gh-106307 added PyObject_GetOptionalAttr() and +// PyMapping_GetOptionalItemString() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int +PyMapping_GetOptionalItem(PyObject *obj, PyObject *key, PyObject **result) +{ + *result = PyObject_GetItem(obj, key); + if (*result) { + return 1; + } + if (!PyErr_ExceptionMatches(PyExc_KeyError)) { + return -1; + } + PyErr_Clear(); + return 0; +} + +static inline int +PyMapping_GetOptionalItemString(PyObject *obj, const char *key, PyObject **result) +{ + PyObject *key_obj; + int rc; +#if PY_VERSION_HEX >= 0x03000000 + key_obj = PyUnicode_FromString(key); +#else + key_obj = PyString_FromString(key); +#endif + if (key_obj == NULL) { + *result = NULL; + return -1; + } + rc = PyMapping_GetOptionalItem(obj, key_obj, result); + Py_DECREF(key_obj); + return rc; +} +#endif + +// gh-108511 added PyMapping_HasKeyWithError() and +// PyMapping_HasKeyStringWithError() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int +PyMapping_HasKeyWithError(PyObject *obj, PyObject *key) +{ + PyObject *res; + int rc = PyMapping_GetOptionalItem(obj, key, &res); + Py_XDECREF(res); + return rc; +} + +static inline int +PyMapping_HasKeyStringWithError(PyObject *obj, const char *key) +{ + PyObject *res; + int rc = PyMapping_GetOptionalItemString(obj, key, &res); + Py_XDECREF(res); + return rc; +} +#endif + + +// gh-108511 added PyObject_HasAttrWithError() and +// PyObject_HasAttrStringWithError() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int +PyObject_HasAttrWithError(PyObject *obj, PyObject *attr) +{ + PyObject *res; + int rc = PyObject_GetOptionalAttr(obj, attr, &res); + Py_XDECREF(res); + return rc; +} + +static inline int +PyObject_HasAttrStringWithError(PyObject *obj, const char *attr) +{ + PyObject *res; + int rc = PyObject_GetOptionalAttrString(obj, attr, &res); + Py_XDECREF(res); + return rc; +} +#endif + + +// gh-106004 added PyDict_GetItemRef() and PyDict_GetItemStringRef() +// to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int +PyDict_GetItemRef(PyObject *mp, PyObject *key, PyObject **result) +{ +#if PY_VERSION_HEX >= 0x03000000 + PyObject *item = PyDict_GetItemWithError(mp, key); +#else + PyObject *item = _PyDict_GetItemWithError(mp, key); +#endif + if (item != NULL) { + *result = Py_NewRef(item); + return 1; // found + } + if (!PyErr_Occurred()) { + *result = NULL; + return 0; // not found + } + *result = NULL; + return -1; +} + +static inline int +PyDict_GetItemStringRef(PyObject *mp, const char *key, PyObject **result) +{ + int res; +#if PY_VERSION_HEX >= 0x03000000 + PyObject *key_obj = PyUnicode_FromString(key); +#else + PyObject *key_obj = PyString_FromString(key); +#endif + if (key_obj == NULL) { + *result = NULL; + return -1; + } + res = PyDict_GetItemRef(mp, key_obj, result); + Py_DECREF(key_obj); + return res; +} +#endif + + +// gh-106307 added PyModule_Add() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline 
int +PyModule_Add(PyObject *mod, const char *name, PyObject *value) +{ + int res = PyModule_AddObjectRef(mod, name, value); + Py_XDECREF(value); + return res; +} +#endif + + +// gh-108014 added Py_IsFinalizing() to Python 3.13.0a1 +// bpo-1856 added _Py_Finalizing to Python 3.2.1b1. +// _Py_IsFinalizing() was added to PyPy 7.3.0. +#if (0x030201B1 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x030D00A1) \ + && (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x7030000) +static inline int Py_IsFinalizing(void) +{ +#if PY_VERSION_HEX >= 0x030700A1 + // _Py_IsFinalizing() was added to Python 3.7.0a1. + return _Py_IsFinalizing(); +#else + return (_Py_Finalizing != NULL); +#endif +} +#endif + + +// gh-108323 added PyDict_ContainsString() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int PyDict_ContainsString(PyObject *op, const char *key) +{ + PyObject *key_obj = PyUnicode_FromString(key); + if (key_obj == NULL) { + return -1; + } + int res = PyDict_Contains(op, key_obj); + Py_DECREF(key_obj); + return res; +} +#endif + + +// gh-108445 added PyLong_AsInt() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int PyLong_AsInt(PyObject *obj) +{ +#ifdef PYPY_VERSION + long value = PyLong_AsLong(obj); + if (value == -1 && PyErr_Occurred()) { + return -1; + } + if (value < (long)INT_MIN || (long)INT_MAX < value) { + PyErr_SetString(PyExc_OverflowError, + "Python int too large to convert to C int"); + return -1; + } + return (int)value; +#else + return _PyLong_AsInt(obj); +#endif +} +#endif + + +// gh-107073 added PyObject_VisitManagedDict() to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int +PyObject_VisitManagedDict(PyObject *obj, visitproc visit, void *arg) +{ + PyObject **dict = _PyObject_GetDictPtr(obj); + if (*dict == NULL) { + return -1; + } + Py_VISIT(*dict); + return 0; +} + +static inline void +PyObject_ClearManagedDict(PyObject *obj) +{ + PyObject **dict = _PyObject_GetDictPtr(obj); + if (*dict == NULL) { + return; + } + Py_CLEAR(*dict); +} +#endif + +// gh-108867 added PyThreadState_GetUnchecked() to Python 3.13.0a1 +// Python 3.5.2 added _PyThreadState_UncheckedGet(). +#if PY_VERSION_HEX >= 0x03050200 && PY_VERSION_HEX < 0x030D00A1 +static inline PyThreadState* +PyThreadState_GetUnchecked(void) +{ + return _PyThreadState_UncheckedGet(); +} +#endif + +// gh-110289 added PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() +// to Python 3.13.0a1 +#if PY_VERSION_HEX < 0x030D00A1 +static inline int +PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t str_len) +{ + Py_ssize_t len; + const void *utf8; + PyObject *exc_type, *exc_value, *exc_tb; + int res; + + // API cannot report errors so save/restore the exception + PyErr_Fetch(&exc_type, &exc_value, &exc_tb); + + // Python 3.3.0a1 added PyUnicode_AsUTF8AndSize() +#if PY_VERSION_HEX >= 0x030300A1 + if (PyUnicode_IS_ASCII(unicode)) { + utf8 = PyUnicode_DATA(unicode); + len = PyUnicode_GET_LENGTH(unicode); + } + else { + utf8 = PyUnicode_AsUTF8AndSize(unicode, &len); + if (utf8 == NULL) { + // Memory allocation failure. The API cannot report error, + // so ignore the exception and return 0. + res = 0; + goto done; + } + } + + if (len != str_len) { + res = 0; + goto done; + } + res = (memcmp(utf8, str, (size_t)len) == 0); +#else + PyObject *bytes = PyUnicode_AsUTF8String(unicode); + if (bytes == NULL) { + // Memory allocation failure. The API cannot report error, + // so ignore the exception and return 0. 
+ res = 0; + goto done; + } + +#if PY_VERSION_HEX >= 0x03000000 + len = PyBytes_GET_SIZE(bytes); + utf8 = PyBytes_AS_STRING(bytes); +#else + len = PyString_GET_SIZE(bytes); + utf8 = PyString_AS_STRING(bytes); +#endif + if (len != str_len) { + Py_DECREF(bytes); + res = 0; + goto done; + } + + res = (memcmp(utf8, str, (size_t)len) == 0); + Py_DECREF(bytes); +#endif + +done: + PyErr_Restore(exc_type, exc_value, exc_tb); + return res; +} + +static inline int +PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) +{ + return PyUnicode_EqualToUTF8AndSize(unicode, str, (Py_ssize_t)strlen(str)); +} +#endif + + +// gh-111138 added PyList_Extend() and PyList_Clear() to Python 3.13.0a2 +#if PY_VERSION_HEX < 0x030D00A2 +static inline int +PyList_Extend(PyObject *list, PyObject *iterable) +{ + return PyList_SetSlice(list, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, iterable); +} + +static inline int +PyList_Clear(PyObject *list) +{ + return PyList_SetSlice(list, 0, PY_SSIZE_T_MAX, NULL); +} +#endif + +// gh-111262 added PyDict_Pop() and PyDict_PopString() to Python 3.13.0a2 +#if PY_VERSION_HEX < 0x030D00A2 +static inline int +PyDict_Pop(PyObject *dict, PyObject *key, PyObject **result) +{ + PyObject *value; + + if (!PyDict_Check(dict)) { + PyErr_BadInternalCall(); + if (result) { + *result = NULL; + } + return -1; + } + + // bpo-16991 added _PyDict_Pop() to Python 3.5.0b2. + // Python 3.6.0b3 changed _PyDict_Pop() first argument type to PyObject*. + // Python 3.13.0a1 removed _PyDict_Pop(). +#if defined(PYPY_VERSION) || PY_VERSION_HEX < 0x030500b2 || PY_VERSION_HEX >= 0x030D0000 + value = PyObject_CallMethod(dict, "pop", "O", key); +#elif PY_VERSION_HEX < 0x030600b3 + value = _PyDict_Pop(_Py_CAST(PyDictObject*, dict), key, NULL); +#else + value = _PyDict_Pop(dict, key, NULL); +#endif + if (value == NULL) { + if (result) { + *result = NULL; + } + if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_KeyError)) { + return -1; + } + PyErr_Clear(); + return 0; + } + if (result) { + *result = value; + } + else { + Py_DECREF(value); + } + return 1; +} + +static inline int +PyDict_PopString(PyObject *dict, const char *key, PyObject **result) +{ + PyObject *key_obj = PyUnicode_FromString(key); + if (key_obj == NULL) { + if (result != NULL) { + *result = NULL; + } + return -1; + } + + int res = PyDict_Pop(dict, key_obj, result); + Py_DECREF(key_obj); + return res; +} +#endif + + +#if PY_VERSION_HEX < 0x030200A4 +// Python 3.2.0a4 added Py_hash_t type +typedef Py_ssize_t Py_hash_t; +#endif + + +// gh-111545 added Py_HashPointer() to Python 3.13.0a3 +#if PY_VERSION_HEX < 0x030D00A3 +static inline Py_hash_t Py_HashPointer(const void *ptr) +{ +#if PY_VERSION_HEX >= 0x030900A4 && !defined(PYPY_VERSION) + return _Py_HashPointer(ptr); +#else + return _Py_HashPointer(_Py_CAST(void*, ptr)); +#endif +} +#endif + + +// Python 3.13a4 added a PyTime API. +// Use the private API added to Python 3.5. 
+#if PY_VERSION_HEX < 0x030D00A4 && PY_VERSION_HEX >= 0x03050000 +typedef _PyTime_t PyTime_t; +#define PyTime_MIN _PyTime_MIN +#define PyTime_MAX _PyTime_MAX + +static inline double PyTime_AsSecondsDouble(PyTime_t t) +{ return _PyTime_AsSecondsDouble(t); } + +static inline int PyTime_Monotonic(PyTime_t *result) +{ return _PyTime_GetMonotonicClockWithInfo(result, NULL); } + +static inline int PyTime_Time(PyTime_t *result) +{ return _PyTime_GetSystemClockWithInfo(result, NULL); } + +static inline int PyTime_PerfCounter(PyTime_t *result) +{ +#if PY_VERSION_HEX >= 0x03070000 && !defined(PYPY_VERSION) + return _PyTime_GetPerfCounterWithInfo(result, NULL); +#elif PY_VERSION_HEX >= 0x03070000 + // Call time.perf_counter_ns() and convert Python int object to PyTime_t. + // Cache time.perf_counter_ns() function for best performance. + static PyObject *func = NULL; + if (func == NULL) { + PyObject *mod = PyImport_ImportModule("time"); + if (mod == NULL) { + return -1; + } + + func = PyObject_GetAttrString(mod, "perf_counter_ns"); + Py_DECREF(mod); + if (func == NULL) { + return -1; + } + } + + PyObject *res = PyObject_CallNoArgs(func); + if (res == NULL) { + return -1; + } + long long value = PyLong_AsLongLong(res); + Py_DECREF(res); + + if (value == -1 && PyErr_Occurred()) { + return -1; + } + + Py_BUILD_ASSERT(sizeof(value) >= sizeof(PyTime_t)); + *result = (PyTime_t)value; + return 0; +#else + // Call time.perf_counter() and convert C double to PyTime_t. + // Cache time.perf_counter() function for best performance. + static PyObject *func = NULL; + if (func == NULL) { + PyObject *mod = PyImport_ImportModule("time"); + if (mod == NULL) { + return -1; + } + + func = PyObject_GetAttrString(mod, "perf_counter"); + Py_DECREF(mod); + if (func == NULL) { + return -1; + } + } + + PyObject *res = PyObject_CallNoArgs(func); + if (res == NULL) { + return -1; + } + double d = PyFloat_AsDouble(res); + Py_DECREF(res); + + if (d == -1.0 && PyErr_Occurred()) { + return -1; + } + + // Avoid floor() to avoid having to link to libm + *result = (PyTime_t)(d * 1e9); + return 0; +#endif +} + +#endif + +// gh-111389 added hash constants to Python 3.13.0a5. These constants were +// added first as private macros to Python 3.4.0b1 and PyPy 7.3.9. 
+#if (!defined(PyHASH_BITS) \ + && ((!defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x030400B1) \ + || (defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x03070000 \ + && PYPY_VERSION_NUM >= 0x07090000))) +# define PyHASH_BITS _PyHASH_BITS +# define PyHASH_MODULUS _PyHASH_MODULUS +# define PyHASH_INF _PyHASH_INF +# define PyHASH_IMAG _PyHASH_IMAG +#endif + + +// gh-111545 added Py_GetConstant() and Py_GetConstantBorrowed() +// to Python 3.13.0a6 +#if PY_VERSION_HEX < 0x030D00A6 && !defined(Py_CONSTANT_NONE) + +#define Py_CONSTANT_NONE 0 +#define Py_CONSTANT_FALSE 1 +#define Py_CONSTANT_TRUE 2 +#define Py_CONSTANT_ELLIPSIS 3 +#define Py_CONSTANT_NOT_IMPLEMENTED 4 +#define Py_CONSTANT_ZERO 5 +#define Py_CONSTANT_ONE 6 +#define Py_CONSTANT_EMPTY_STR 7 +#define Py_CONSTANT_EMPTY_BYTES 8 +#define Py_CONSTANT_EMPTY_TUPLE 9 + +static inline PyObject* Py_GetConstant(unsigned int constant_id) +{ + static PyObject* constants[Py_CONSTANT_EMPTY_TUPLE + 1] = {NULL}; + + if (constants[Py_CONSTANT_NONE] == NULL) { + constants[Py_CONSTANT_NONE] = Py_None; + constants[Py_CONSTANT_FALSE] = Py_False; + constants[Py_CONSTANT_TRUE] = Py_True; + constants[Py_CONSTANT_ELLIPSIS] = Py_Ellipsis; + constants[Py_CONSTANT_NOT_IMPLEMENTED] = Py_NotImplemented; + + constants[Py_CONSTANT_ZERO] = PyLong_FromLong(0); + if (constants[Py_CONSTANT_ZERO] == NULL) { + goto fatal_error; + } + + constants[Py_CONSTANT_ONE] = PyLong_FromLong(1); + if (constants[Py_CONSTANT_ONE] == NULL) { + goto fatal_error; + } + + constants[Py_CONSTANT_EMPTY_STR] = PyUnicode_FromStringAndSize("", 0); + if (constants[Py_CONSTANT_EMPTY_STR] == NULL) { + goto fatal_error; + } + + constants[Py_CONSTANT_EMPTY_BYTES] = PyBytes_FromStringAndSize("", 0); + if (constants[Py_CONSTANT_EMPTY_BYTES] == NULL) { + goto fatal_error; + } + + constants[Py_CONSTANT_EMPTY_TUPLE] = PyTuple_New(0); + if (constants[Py_CONSTANT_EMPTY_TUPLE] == NULL) { + goto fatal_error; + } + // goto dance to avoid compiler warnings about Py_FatalError() + goto init_done; + +fatal_error: + // This case should never happen + Py_FatalError("Py_GetConstant() failed to get constants"); + } + +init_done: + if (constant_id <= Py_CONSTANT_EMPTY_TUPLE) { + return Py_NewRef(constants[constant_id]); + } + else { + PyErr_BadInternalCall(); + return NULL; + } +} + +static inline PyObject* Py_GetConstantBorrowed(unsigned int constant_id) +{ + PyObject *obj = Py_GetConstant(constant_id); + Py_XDECREF(obj); + return obj; +} +#endif + + +// gh-114329 added PyList_GetItemRef() to Python 3.13.0a4 +#if PY_VERSION_HEX < 0x030D00A4 +static inline PyObject * +PyList_GetItemRef(PyObject *op, Py_ssize_t index) +{ + PyObject *item = PyList_GetItem(op, index); + Py_XINCREF(item); + return item; +} +#endif + + +// gh-114329 added PyList_GetItemRef() to Python 3.13.0a4 +#if PY_VERSION_HEX < 0x030D00A4 +static inline int +PyDict_SetDefaultRef(PyObject *d, PyObject *key, PyObject *default_value, + PyObject **result) +{ + PyObject *value; + if (PyDict_GetItemRef(d, key, &value) < 0) { + // get error + if (result) { + *result = NULL; + } + return -1; + } + if (value != NULL) { + // present + if (result) { + *result = value; + } + else { + Py_DECREF(value); + } + return 1; + } + + // missing: set the item + if (PyDict_SetItem(d, key, default_value) < 0) { + // set error + if (result) { + *result = NULL; + } + return -1; + } + if (result) { + *result = Py_NewRef(default_value); + } + return 0; +} +#endif + +#if PY_VERSION_HEX < 0x030E0000 && PY_VERSION_HEX >= 0x03060000 && !defined(PYPY_VERSION) +typedef struct PyUnicodeWriter 
PyUnicodeWriter; + +static inline void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) +{ + _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer); + PyMem_Free(writer); +} + +static inline PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) +{ + if (length < 0) { + PyErr_SetString(PyExc_ValueError, + "length must be positive"); + return NULL; + } + + const size_t size = sizeof(_PyUnicodeWriter); + PyUnicodeWriter *pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); + if (pub_writer == _Py_NULL) { + PyErr_NoMemory(); + return _Py_NULL; + } + _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; + + _PyUnicodeWriter_Init(writer); + if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) { + PyUnicodeWriter_Discard(pub_writer); + return NULL; + } + writer->overallocate = 1; + return pub_writer; +} + +static inline PyObject* PyUnicodeWriter_Finish(PyUnicodeWriter *writer) +{ + PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer); + assert(((_PyUnicodeWriter*)writer)->buffer == NULL); + PyMem_Free(writer); + return str; +} + +static inline int +PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) +{ + if (ch > 0x10ffff) { + PyErr_SetString(PyExc_ValueError, + "character must be in range(0x110000)"); + return -1; + } + + return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch); +} + +static inline int +PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) +{ + PyObject *str = PyObject_Str(obj); + if (str == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} + +static inline int +PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) +{ + PyObject *str = PyObject_Repr(obj); + if (str == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} + +static inline int +PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, + const char *str, Py_ssize_t size) +{ + if (size < 0) { + size = (Py_ssize_t)strlen(str); + } + + PyObject *str_obj = PyUnicode_FromStringAndSize(str, size); + if (str_obj == _Py_NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj); + Py_DECREF(str_obj); + return res; +} + +static inline int +PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, + const wchar_t *str, Py_ssize_t size) +{ + if (size < 0) { + size = (Py_ssize_t)wcslen(str); + } + + PyObject *str_obj = PyUnicode_FromWideChar(str, size); + if (str_obj == _Py_NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj); + Py_DECREF(str_obj); + return res; +} + +static inline int +PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t start, Py_ssize_t end) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str, not %T", str); + return -1; + } + if (start < 0 || start > end) { + PyErr_Format(PyExc_ValueError, "invalid start argument"); + return -1; + } + if (end > PyUnicode_GET_LENGTH(str)) { + PyErr_Format(PyExc_ValueError, "invalid end argument"); + return -1; + } + + return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str, + start, end); +} + +static inline int +PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...) 
+{ + va_list vargs; + va_start(vargs, format); + PyObject *str = PyUnicode_FromFormatV(format, vargs); + va_end(vargs); + if (str == _Py_NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} +#endif // PY_VERSION_HEX < 0x030E0000 + +// gh-116560 added PyLong_GetSign() to Python 3.14.0a0 +#if PY_VERSION_HEX < 0x030E00A0 +static inline int PyLong_GetSign(PyObject *obj, int *sign) +{ + if (!PyLong_Check(obj)) { + PyErr_Format(PyExc_TypeError, "expect int, got %s", Py_TYPE(obj)->tp_name); + return -1; + } + + *sign = _PyLong_Sign(obj); + return 0; +} +#endif + + +#ifdef __cplusplus +} +#endif +#endif // PYTHONCAPI_COMPAT From fb202ee66d73572f46035c5b2f21ac22f74ba951 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 15 Aug 2024 21:04:39 +0800 Subject: [PATCH 016/157] GH-43703: [C++][Parquet][CI] Parquet: Introducing more bad_data for testing (#43708) ### Rationale for this change Introducing more bad_data for testing ### What changes are included in this PR? * Upgrade parquet-testing * Introduce more bad_data * Update fuzz generation ### Are these changes tested? They're tests :-) ### Are there any user-facing changes? no * GitHub Issue: #43703 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/build-support/fuzzing/generate_corpuses.sh | 1 + cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 12 +++++++++--- cpp/submodules/parquet-testing | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh index e3f00e64782..ffd5c54e443 100755 --- a/cpp/build-support/fuzzing/generate_corpuses.sh +++ b/cpp/build-support/fuzzing/generate_corpuses.sh @@ -56,4 +56,5 @@ rm -rf ${CORPUS_DIR} ${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR} # Add Parquet testing examples cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR} +cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR} ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index aad1e933c4f..64030e0f90d 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5298,14 +5298,20 @@ TEST(TestArrowReadWrite, MultithreadedWrite) { TEST(TestArrowReadWrite, FuzzReader) { constexpr size_t kMaxFileSize = 1024 * 1024 * 1; - { - auto path = test::get_data_file("PARQUET-1481.parquet", /*is_good=*/false); + auto check_bad_file = [&](const std::string& file_name) { + SCOPED_TRACE(file_name); + auto path = test::get_data_file(file_name, /*is_good=*/false); PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open( path, ::arrow::io::FileMode::READ)); PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize)); auto s = internal::FuzzReader(buffer->data(), buffer->size()); ASSERT_NOT_OK(s); - } + }; + check_bad_file("PARQUET-1481.parquet"); + check_bad_file("ARROW-GH-41317.parquet"); + check_bad_file("ARROW-GH-41321.parquet"); + check_bad_file("ARROW-RS-GH-6229-LEVELS.parquet"); + check_bad_file("ARROW-RS-GH-6229-DICTHEADER.parquet"); { auto path = test::get_data_file("alltypes_plain.parquet", /*is_good=*/true); PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open( diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 
74278bc4a11..cb7a9674142 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 74278bc4a1122d74945969e6dec405abd1533ec3 +Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199 From dfe6c50cf81a6893e44b1e2056301bfdfc2be48b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 15 Aug 2024 07:46:35 -0700 Subject: [PATCH 017/157] MINOR: [C#] Bump BenchmarkDotNet and System.Runtime.CompilerServices.Unsafe in /csharp (#43651) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [BenchmarkDotNet](https://github.com/dotnet/BenchmarkDotNet) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `BenchmarkDotNet` from 0.13.12 to 0.14.0
Release notes

Sourced from BenchmarkDotNet's releases.

0.14.0

Full changelog: https://benchmarkdotnet.org/changelog/v0.14.0.html

Highlights

  • Introduce BenchmarkDotNet.Diagnostics.dotMemory #2549: memory allocation profile of your benchmarks using dotMemory, see @BenchmarkDotNet.Samples.IntroDotMemoryDiagnoser
  • Introduce BenchmarkDotNet.Exporters.Plotting #2560: plotting via ScottPlot (initial version)
  • Multiple bugfixes
  • The default build toolchains have been updated to pass IntermediateOutputPath, OutputPath, and OutDir properties to the dotnet build command. This change forces all build outputs to be placed in a new directory generated by BenchmarkDotNet, and fixes many issues that have been reported with builds. You can also access these paths in your own .csproj and .props from those properties if you need to copy custom files to the output.

Bug fixes

  • Fixed multiple build-related bugs including passing MsBuildArguments and .Net 8's UseArtifactsOutput.

Breaking Changes

  • DotNetCliBuilder removed retryFailedBuildWithNoDeps constructor option.
  • DotNetCliCommand removed RetryFailedBuildWithNoDeps property and BuildNoRestoreNoDependencies() and PublishNoBuildAndNoRestore() methods (replaced with PublishNoRestore()).
Commits
  • cf882d3 Add macOS Sequoia in OsBrandStringHelper
  • 17cf3b0 [docs] Prepare v0.14.0 changelog
  • b3fbe7c Set next BenchmarkDotNet version: 0.14.0
  • 23e6c52 Fix InvalidOperationException in DotMemoryDiagnoser
  • 3d34edb Bump JetBrains.Profiler.SelfApi: 2.5.2->2.5.9
  • bf0a49d fix(CI): Deprecation issues (#2605)
  • 0275649 Fixed crash from TaskbarProgress when BuiltInComInteropSupport is disabled. ...
  • 15200d4 [build] Add BenchmarkDotNet.Exporters.Plotting.Tests to unit-tests
  • 834417a Improve logging in ScottPlotExporterTests
  • f8082a2 Fix IntroSummaryStyle compilation
  • Additional commits viewable in compare view

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 5.0.0
Release notes

Sourced from System.Runtime.CompilerServices.Unsafe's releases.

.NET 5

Release Notes Install Instructions

Repo

Commits

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj index f735f01b022..5bf51f5c305 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj +++ b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj @@ -6,7 +6,7 @@ - + From 8b634ad2998b6a670cc7d4d3ef0e43dea3b7aca1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 15 Aug 2024 08:07:56 -0700 Subject: [PATCH 018/157] MINOR: [C#] Bump BenchmarkDotNet.Diagnostics.Windows and System.Runtime.CompilerServices.Unsafe in /csharp (#43711) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [BenchmarkDotNet.Diagnostics.Windows](https://github.com/dotnet/BenchmarkDotNet) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `BenchmarkDotNet.Diagnostics.Windows` from 0.13.12 to 0.14.0
Release notes

Sourced from BenchmarkDotNet.Diagnostics.Windows's releases.

0.14.0

Full changelog: https://benchmarkdotnet.org/changelog/v0.14.0.html

Highlights

  • Introduce BenchmarkDotNet.Diagnostics.dotMemory #2549: memory allocation profile of your benchmarks using dotMemory, see @BenchmarkDotNet.Samples.IntroDotMemoryDiagnoser
  • Introduce BenchmarkDotNet.Exporters.Plotting #2560: plotting via ScottPlot (initial version)
  • Multiple bugfixes
  • The default build toolchains have been updated to pass IntermediateOutputPath, OutputPath, and OutDir properties to the dotnet build command. This change forces all build outputs to be placed in a new directory generated by BenchmarkDotNet, and fixes many issues that have been reported with builds. You can also access these paths in your own .csproj and .props from those properties if you need to copy custom files to the output.

Bug fixes

  • Fixed multiple build-related bugs including passing MsBuildArguments and .Net 8's UseArtifactsOutput.

Breaking Changes

  • DotNetCliBuilder removed retryFailedBuildWithNoDeps constructor option.
  • DotNetCliCommand removed RetryFailedBuildWithNoDeps property and BuildNoRestoreNoDependencies() and PublishNoBuildAndNoRestore() methods (replaced with PublishNoRestore()).
Commits
  • cf882d3 Add macOS Sequoia in OsBrandStringHelper
  • 17cf3b0 [docs] Prepare v0.14.0 changelog
  • b3fbe7c Set next BenchmarkDotNet version: 0.14.0
  • 23e6c52 Fix InvalidOperationException in DotMemoryDiagnoser
  • 3d34edb Bump JetBrains.Profiler.SelfApi: 2.5.2->2.5.9
  • bf0a49d fix(CI): Deprecation issues (#2605)
  • 0275649 Fixed crash from TaskbarProgress when BuiltInComInteropSupport is disabled. ...
  • 15200d4 [build] Add BenchmarkDotNet.Exporters.Plotting.Tests to unit-tests
  • 834417a Improve logging in ScottPlotExporterTests
  • f8082a2 Fix IntroSummaryStyle compilation
  • Additional commits viewable in compare view

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 5.0.0
Commits

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj index 5bf51f5c305..0a3e3341041 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj +++ b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj @@ -7,7 +7,7 @@ - + From 2e434dad9b0cdcc57524dc2a0cc7f7b3ed23ccc4 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 16 Aug 2024 00:10:51 +0800 Subject: [PATCH 019/157] GH-43687: [C++] Compute: fix register kernel SimdLevel for AddMinMax512AggKernels (#43704) ### Rationale for this change See https://github.com/apache/arrow/issues/43687 ### What changes are included in this PR? Change Registered AVX2 to AVX512 ### Are these changes tested? No ### Are there any user-facing changes? maybe bugfix * GitHub Issue: #43687 Authored-by: mwish Signed-off-by: mwish --- cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc index 0d66ed2ec3e..05356e0aa5e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc @@ -80,8 +80,8 @@ void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func) { AddMinMaxKernels(MinMaxInitAvx512, {int32(), uint32(), int64(), uint64()}, func, SimdLevel::AVX512); AddMinMaxKernels(MinMaxInitAvx512, TemporalTypes(), func, SimdLevel::AVX512); - AddMinMaxKernels(MinMaxInitAvx512, BaseBinaryTypes(), func, SimdLevel::AVX2); - AddMinMaxKernel(MinMaxInitAvx512, Type::FIXED_SIZE_BINARY, func, SimdLevel::AVX2); + AddMinMaxKernels(MinMaxInitAvx512, BaseBinaryTypes(), func, SimdLevel::AVX512); + AddMinMaxKernel(MinMaxInitAvx512, Type::FIXED_SIZE_BINARY, func, SimdLevel::AVX512); AddMinMaxKernel(MinMaxInitAvx512, Type::INTERVAL_MONTHS, func, SimdLevel::AVX512); } From 2767dc55cb41377af6895141f717475d73b2892d Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Thu, 15 Aug 2024 09:32:22 -0700 Subject: [PATCH 020/157] GH-41579: [C++][Python][Parquet] Support reading/writing key-value metadata from/to ColumnChunkMetaData (#41580) ### Rationale for this change Parquet standard allows reading/writing key-value metadata from/to ColumnChunkMetaData, but there is no way to do that with Parquet C++. ### What changes are included in this PR? Support reading/writing key-value metadata from/to ColumnChunkMetaData with Parquet C++ reader/writer. Support reading key-value metadata from ColumnChunkMetaData with pyarrow.parquet. ### Are these changes tested? Yes, unit tests are added ### Are there any user-facing changes? Yes. - Users can read or write key-value metadata for column chunks with Parquet C++. - Users can read key-value metadata for column chunks with PyArrow. - parquet-reader tool prints key-value metadata in column chunks when `--print-key-value-metadata` option is used. 
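As a rough usage sketch of the APIs described above (adapted from the `WriteKeyValueMetadataEndToEnd` unit test added in this PR; the include list, the free-standing helper function, and the `schema_root` argument are illustrative assumptions, not part of the patch):

```cpp
// Sketch: attach key-value metadata to a column chunk, then read it back.
#include <memory>

#include "arrow/io/memory.h"
#include "arrow/util/key_value_metadata.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/file_writer.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

void RoundTripColumnChunkKeyValueMetadata(
    const std::shared_ptr<parquet::schema::GroupNode>& schema_root) {
  auto sink = parquet::CreateOutputStream();
  {
    auto file_writer = parquet::ParquetFileWriter::Open(sink, schema_root);
    auto rg_writer = file_writer->AppendRowGroup();
    auto col_writer = rg_writer->NextColumn();
    // New in this patch: key-value metadata set on the writer before it is
    // closed ends up in the ColumnChunkMetaData of that column chunk.
    col_writer->AddKeyValueMetadata(
        arrow::KeyValueMetadata::Make({"foo"}, {"bar"}));
    file_writer->Close();
  }

  PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish());
  auto file_reader = parquet::ParquetFileReader::Open(
      std::make_shared<arrow::io::BufferReader>(buffer));
  // New in this patch: ColumnChunkMetaData::key_value_metadata() returns the
  // chunk-level metadata, or nullptr if none was written.
  auto kv =
      file_reader->metadata()->RowGroup(0)->ColumnChunk(0)->key_value_metadata();
  if (kv != nullptr) {
    PARQUET_ASSIGN_OR_THROW(auto value, kv->Get("foo"));  // value == "bar"
  }
}
```

From Python the same information is exposed read-only through `ColumnChunkMetaData.metadata` (e.g. `pq.read_metadata(path).row_group(0).column(0).metadata`), which returns `None` when the column chunk carries no key-value metadata.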
* GitHub Issue: #41579 Lead-authored-by: Chungmin Lee Co-authored-by: mwish Signed-off-by: mwish --- cpp/src/parquet/column_writer.cc | 24 ++++++ cpp/src/parquet/column_writer.h | 12 +++ cpp/src/parquet/column_writer_test.cc | 69 +++++++++++++++ cpp/src/parquet/metadata.cc | 84 ++++++++++++++----- cpp/src/parquet/metadata.h | 5 ++ cpp/src/parquet/printer.cc | 32 +++++-- python/pyarrow/_parquet.pxd | 1 + python/pyarrow/_parquet.pyx | 13 +++ python/pyarrow/tests/parquet/conftest.py | 12 +++ python/pyarrow/tests/parquet/test_metadata.py | 9 ++ 10 files changed, 235 insertions(+), 26 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f859ec9653f..40d19d38e10 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -40,6 +40,7 @@ #include "arrow/util/crc32.h" #include "arrow/util/endian.h" #include "arrow/util/float16.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/rle_encoding_internal.h" #include "arrow/util/type_traits.h" @@ -832,6 +833,9 @@ class ColumnWriterImpl { void FlushBufferedDataPages(); ColumnChunkMetaDataBuilder* metadata_; + // key_value_metadata_ for the column chunk + // It would be nullptr if there is no KeyValueMetadata set. + std::shared_ptr key_value_metadata_; const ColumnDescriptor* descr_; // scratch buffer if validity bits need to be recalculated. std::shared_ptr bits_buffer_; @@ -1100,6 +1104,7 @@ int64_t ColumnWriterImpl::Close() { if (rows_written_ > 0 && chunk_statistics.is_set()) { metadata_->SetStatistics(chunk_statistics); } + metadata_->SetKeyValueMetadata(key_value_metadata_); pager_->Close(has_dictionary_, fallback_); } @@ -1397,6 +1402,25 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< return pages_change_on_record_boundaries_; } + void AddKeyValueMetadata( + const std::shared_ptr& key_value_metadata) override { + if (closed_) { + throw ParquetException("Cannot add key-value metadata to closed column"); + } + if (key_value_metadata_ == nullptr) { + key_value_metadata_ = key_value_metadata; + } else if (key_value_metadata != nullptr) { + key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata); + } + } + + void ResetKeyValueMetadata() override { + if (closed_) { + throw ParquetException("Cannot add key-value metadata to closed column"); + } + key_value_metadata_ = nullptr; + } + private: using ValueEncoderType = typename EncodingTraits::Encoder; using TypedStats = TypedStatistics; diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index a278670fa81..845bf9aa896 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -21,6 +21,7 @@ #include #include +#include "arrow/type_fwd.h" #include "arrow/util/compression.h" #include "parquet/exception.h" #include "parquet/platform.h" @@ -181,6 +182,17 @@ class PARQUET_EXPORT ColumnWriter { /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; + /// \brief Add key-value metadata to the ColumnChunk. + /// \param[in] key_value_metadata the metadata to add. + /// \note This will overwrite any existing metadata with the same key. + /// \throw ParquetException if Close() has been called. + virtual void AddKeyValueMetadata( + const std::shared_ptr& key_value_metadata) = 0; + + /// \brief Reset the ColumnChunk key-value metadata. + /// \throw ParquetException if Close() has been called. 
+ virtual void ResetKeyValueMetadata() = 0; + /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns /// error status if the array data type is not compatible with the concrete /// writer type. diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index c99efd17961..d2b3aa0dff0 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -23,10 +23,12 @@ #include #include "arrow/io/buffered.h" +#include "arrow/io/file.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/config.h" +#include "arrow/util/key_value_metadata.h" #include "parquet/column_page.h" #include "parquet/column_reader.h" @@ -51,6 +53,9 @@ using schema::PrimitiveNode; namespace test { +using ::testing::IsNull; +using ::testing::NotNull; + // The default size used in most tests. const int SMALL_SIZE = 100; #ifdef PARQUET_VALGRIND @@ -385,6 +390,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { return metadata_accessor->encoding_stats(); } + std::shared_ptr metadata_key_value_metadata() { + // Metadata accessor must be created lazily. + // This is because the ColumnChunkMetaData semantics dictate the metadata object is + // complete (no changes to the metadata buffer can be made after instantiation) + auto metadata_accessor = + ColumnChunkMetaData::Make(metadata_->contents(), this->descr_); + return metadata_accessor->key_value_metadata(); + } + protected: int64_t values_read_; // Keep the reader alive as for ByteArray the lifetime of the ByteArray @@ -1705,5 +1719,60 @@ TEST(TestColumnWriter, WriteDataPageV2HeaderNullCount) { } } +using TestInt32Writer = TestPrimitiveWriter; + +TEST_F(TestInt32Writer, NoWriteKeyValueMetadata) { + auto writer = this->BuildWriter(); + writer->Close(); + auto key_value_metadata = metadata_key_value_metadata(); + ASSERT_THAT(key_value_metadata, IsNull()); +} + +TEST_F(TestInt32Writer, WriteKeyValueMetadata) { + auto writer = this->BuildWriter(); + writer->AddKeyValueMetadata( + KeyValueMetadata::Make({"hello", "bye"}, {"world", "earth"})); + // overwrite the previous value + writer->AddKeyValueMetadata(KeyValueMetadata::Make({"bye"}, {"moon"})); + writer->Close(); + auto key_value_metadata = metadata_key_value_metadata(); + ASSERT_THAT(key_value_metadata, NotNull()); + ASSERT_EQ(2, key_value_metadata->size()); + ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("hello")); + ASSERT_EQ("world", value); + ASSERT_OK_AND_ASSIGN(value, key_value_metadata->Get("bye")); + ASSERT_EQ("moon", value); +} + +TEST_F(TestInt32Writer, ResetKeyValueMetadata) { + auto writer = this->BuildWriter(); + writer->AddKeyValueMetadata(KeyValueMetadata::Make({"hello"}, {"world"})); + writer->ResetKeyValueMetadata(); + writer->Close(); + auto key_value_metadata = metadata_key_value_metadata(); + ASSERT_THAT(key_value_metadata, IsNull()); +} + +TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) { + auto sink = CreateOutputStream(); + { + auto file_writer = ParquetFileWriter::Open( + sink, std::dynamic_pointer_cast(schema_.schema_root())); + auto rg_writer = file_writer->AppendRowGroup(); + auto col_writer = rg_writer->NextColumn(); + col_writer->AddKeyValueMetadata(KeyValueMetadata::Make({"foo"}, {"bar"})); + file_writer->Close(); + } + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + auto file_reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); + auto key_value_metadata = + 
file_reader->metadata()->RowGroup(0)->ColumnChunk(0)->key_value_metadata(); + ASSERT_THAT(key_value_metadata, NotNull()); + ASSERT_EQ(1U, key_value_metadata->size()); + ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("foo")); + ASSERT_EQ("bar", value); +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 10c8afaf375..4f2aa6e3732 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -135,6 +135,39 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d throw ParquetException("Can't decode page statistics for selected column type"); } +// Get KeyValueMetadata from parquet Thrift RowGroup or ColumnChunk metadata. +// +// Returns nullptr if the metadata is not set. +template +std::shared_ptr FromThriftKeyValueMetadata(const Metadata& source) { + std::shared_ptr metadata = nullptr; + if (source.__isset.key_value_metadata) { + std::vector keys; + std::vector values; + keys.reserve(source.key_value_metadata.size()); + values.reserve(source.key_value_metadata.size()); + for (const auto& it : source.key_value_metadata) { + keys.push_back(it.key); + values.push_back(it.value); + } + metadata = std::make_shared(std::move(keys), std::move(values)); + } + return metadata; +} + +template +void ToThriftKeyValueMetadata(const KeyValueMetadata& source, Metadata* metadata) { + std::vector key_value_metadata; + key_value_metadata.reserve(static_cast(source.size())); + for (int64_t i = 0; i < source.size(); ++i) { + format::KeyValue kv_pair; + kv_pair.__set_key(source.key(i)); + kv_pair.__set_value(source.value(i)); + key_value_metadata.emplace_back(std::move(kv_pair)); + } + metadata->__set_key_value_metadata(std::move(key_value_metadata)); +} + // MetaData Accessor // ColumnCryptoMetaData @@ -233,6 +266,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { encoding_stats.count}); } possible_stats_ = nullptr; + InitKeyValueMetadata(); } bool Equals(const ColumnChunkMetaDataImpl& other) const { @@ -343,7 +377,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return std::nullopt; } + const std::shared_ptr& key_value_metadata() const { + return key_value_metadata_; + } + private: + void InitKeyValueMetadata() { + key_value_metadata_ = FromThriftKeyValueMetadata(*column_metadata_); + } + mutable std::shared_ptr possible_stats_; std::vector encodings_; std::vector encoding_stats_; @@ -353,6 +395,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ColumnDescriptor* descr_; const ReaderProperties properties_; const ApplicationVersion* writer_version_; + std::shared_ptr key_value_metadata_; }; std::unique_ptr ColumnChunkMetaData::Make( @@ -471,6 +514,11 @@ bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const { return impl_->Equals(*other.impl_); } +const std::shared_ptr& ColumnChunkMetaData::key_value_metadata() + const { + return impl_->key_value_metadata(); +} + // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { public: @@ -913,7 +961,7 @@ class FileMetaData::FileMetaDataImpl { std::vector column_orders; if (metadata_->__isset.column_orders) { column_orders.reserve(metadata_->column_orders.size()); - for (auto column_order : metadata_->column_orders) { + for (auto& column_order : metadata_->column_orders) { if (column_order.__isset.TYPE_ORDER) { column_orders.push_back(ColumnOrder::type_defined_); } else { @@ -928,14 +976,7 @@ class FileMetaData::FileMetaDataImpl { } void InitKeyValueMetadata() { - std::shared_ptr metadata = nullptr; 
- if (metadata_->__isset.key_value_metadata) { - metadata = std::make_shared(); - for (const auto& it : metadata_->key_value_metadata) { - metadata->Append(it.key, it.value); - } - } - key_value_metadata_ = std::move(metadata); + key_value_metadata_ = FromThriftKeyValueMetadata(*metadata_); } }; @@ -1590,6 +1631,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->meta_data.__set_encodings(std::move(thrift_encodings)); column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats)); + if (key_value_metadata_) { + ToThriftKeyValueMetadata(*key_value_metadata_, &column_chunk_->meta_data); + } + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()->ToDotString()); // column is encrypted @@ -1656,6 +1701,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { return column_chunk_->meta_data.total_compressed_size; } + void SetKeyValueMetadata(std::shared_ptr key_value_metadata) { + key_value_metadata_ = std::move(key_value_metadata); + } + private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; @@ -1670,6 +1719,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::unique_ptr owned_column_chunk_; const std::shared_ptr properties_; const ColumnDescriptor* column_; + std::shared_ptr key_value_metadata_; }; std::unique_ptr ColumnChunkMetaDataBuilder::Make( @@ -1727,6 +1777,11 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) impl_->SetStatistics(result); } +void ColumnChunkMetaDataBuilder::SetKeyValueMetadata( + std::shared_ptr key_value_metadata) { + impl_->SetKeyValueMetadata(std::move(key_value_metadata)); +} + int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { return impl_->total_compressed_size(); } @@ -1925,16 +1980,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } else if (key_value_metadata) { key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata); } - metadata_->key_value_metadata.clear(); - metadata_->key_value_metadata.reserve( - static_cast(key_value_metadata_->size())); - for (int64_t i = 0; i < key_value_metadata_->size(); ++i) { - format::KeyValue kv_pair; - kv_pair.__set_key(key_value_metadata_->key(i)); - kv_pair.__set_value(key_value_metadata_->value(i)); - metadata_->key_value_metadata.push_back(std::move(kv_pair)); - } - metadata_->__isset.key_value_metadata = true; + ToThriftKeyValueMetadata(*key_value_metadata_, metadata_.get()); } int32_t file_version = 0; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index e46297540ba..d1e2d1904a6 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -184,6 +184,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { std::unique_ptr crypto_metadata() const; std::optional GetColumnIndexLocation() const; std::optional GetOffsetIndexLocation() const; + const std::shared_ptr& key_value_metadata() const; private: explicit ColumnChunkMetaData( @@ -466,8 +467,12 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // column chunk // Used when a dataset is spread across multiple files void set_file_path(const std::string& path); + // column metadata void SetStatistics(const EncodedStatistics& stats); + + void SetKeyValueMetadata(std::shared_ptr key_value_metadata); + // get the column descriptor const ColumnDescriptor* descr() const; diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 33df5925a1c..60adfc697f9 100644 --- a/cpp/src/parquet/printer.cc +++ 
b/cpp/src/parquet/printer.cc @@ -64,6 +64,25 @@ void PrintPageEncodingStats(std::ostream& stream, // the fixed initial size is just for an example #define COL_WIDTH 30 +void PutChars(std::ostream& stream, char c, int n) { + for (int i = 0; i < n; ++i) { + stream.put(c); + } +} + +void PrintKeyValueMetadata(std::ostream& stream, + const KeyValueMetadata& key_value_metadata, + int indent_level = 0, int indent_width = 1) { + const int64_t size_of_key_value_metadata = key_value_metadata.size(); + PutChars(stream, ' ', indent_level * indent_width); + stream << "Key Value Metadata: " << size_of_key_value_metadata << " entries\n"; + for (int64_t i = 0; i < size_of_key_value_metadata; i++) { + PutChars(stream, ' ', (indent_level + 1) * indent_width); + stream << "Key nr " << i << " " << key_value_metadata.key(i) << ": " + << key_value_metadata.value(i) << "\n"; + } +} + void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selected_columns, bool print_values, bool format_dump, bool print_key_value_metadata, const char* filename) { @@ -76,12 +95,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte if (print_key_value_metadata && file_metadata->key_value_metadata()) { auto key_value_metadata = file_metadata->key_value_metadata(); - int64_t size_of_key_value_metadata = key_value_metadata->size(); - stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n"; - for (int64_t i = 0; i < size_of_key_value_metadata; i++) { - stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " - << key_value_metadata->value(i) << "\n"; - } + PrintKeyValueMetadata(stream, *key_value_metadata); } stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n"; @@ -136,7 +150,11 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte std::shared_ptr stats = column_chunk->statistics(); const ColumnDescriptor* descr = file_metadata->schema()->Column(i); - stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values(); + stream << "Column " << i << std::endl; + if (print_key_value_metadata && column_chunk->key_value_metadata()) { + PrintKeyValueMetadata(stream, *column_chunk->key_value_metadata(), 1, 2); + } + stream << " Values: " << column_chunk->num_values(); if (column_chunk->is_stats_set()) { std::string min = stats->EncodeMin(), max = stats->EncodeMax(); stream << ", Null Values: " << stats->null_count() diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 35d15227ee5..d6aebd8284f 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -328,6 +328,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: unique_ptr[CColumnCryptoMetaData] crypto_metadata() const optional[ParquetIndexLocation] GetColumnIndexLocation() const optional[ParquetIndexLocation] GetOffsetIndexLocation() const + shared_ptr[const CKeyValueMetadata] key_value_metadata() const struct CSortingColumn" parquet::SortingColumn": int column_idx diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 41b15b633d3..254bfe3b09a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -508,6 +508,19 @@ cdef class ColumnChunkMetaData(_Weakrefable): """Whether the column chunk has a column index""" return self.metadata.GetColumnIndexLocation().has_value() + @property + def metadata(self): + """Additional metadata as key value pairs (dict[bytes, bytes]).""" + cdef: + unordered_map[c_string, c_string] metadata + const 
CKeyValueMetadata* underlying_metadata + underlying_metadata = self.metadata.key_value_metadata().get() + if underlying_metadata != NULL: + underlying_metadata.ToUnorderedMap(&metadata) + return metadata + else: + return None + cdef class SortingColumn: """ diff --git a/python/pyarrow/tests/parquet/conftest.py b/python/pyarrow/tests/parquet/conftest.py index 767e7f6b69d..80605e973cd 100644 --- a/python/pyarrow/tests/parquet/conftest.py +++ b/python/pyarrow/tests/parquet/conftest.py @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. +import os +import pathlib + import pytest from pyarrow.util import guid @@ -25,6 +28,15 @@ def datadir(base_datadir): return base_datadir / 'parquet' +@pytest.fixture(scope='module') +def parquet_test_datadir(): + result = os.environ.get('PARQUET_TEST_DATA') + if not result: + raise RuntimeError('Please point the PARQUET_TEST_DATA environment ' + 'variable to the test data directory') + return pathlib.Path(result) + + @pytest.fixture def s3_bucket(s3_server): boto3 = pytest.importorskip('boto3') diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 528cf0110dd..c29213ebc3d 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -782,3 +782,12 @@ def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs): assert meta1.read_bytes() == meta2.read_bytes() \ == meta3.read_bytes() == meta4.read_bytes() \ == s3_fs.open(meta5).read() + + +def test_column_chunk_key_value_metadata(parquet_test_datadir): + metadata = pq.read_metadata(parquet_test_datadir / + 'column_chunk_key_value_metadata.parquet') + key_value_metadata1 = metadata.row_group(0).column(0).metadata + assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue': b''} + key_value_metadata2 = metadata.row_group(0).column(1).metadata + assert key_value_metadata2 is None From a50ad422cff112efb022d081e34344249ac83530 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 16 Aug 2024 02:06:08 +0200 Subject: [PATCH 021/157] MINOR: [CI] Fix ubuntu-lint to not install into system Python (#43710) ### Rationale for this change Currently, the `ubuntu-lint` Docker build would install its Python dependencies directly into the system Python, which can fail depending on existing system Python packages. See example here: https://github.com/apache/arrow/actions/runs/10400929007/job/28802420047?pr=43539 where pip's dependency resolution fails with the following error message: ``` packaging.version.InvalidVersion: Invalid version: '2013-02-16' ``` ### What changes are included in this PR? This PR switches to use a virtual environment, guaranteeing that we're not interfering with the system Python and that we're not bound by already installed Python packages. ### Are these changes tested? By CI. ### Are there any user-facing changes? No. 
Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- docker-compose.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index daa5c74bcb9..14eeeeee6e5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1889,6 +1889,9 @@ services: command: > /bin/bash -c " git config --global --add safe.directory /arrow && + python3 -m venv /build/pyvenv && + source /build/pyvenv/bin/activate && + pip install -U pip setuptools && pip install arrow/dev/archery[lint] && archery lint --all --no-clang-tidy --no-iwyu --no-numpydoc --src /arrow" From a970fd72b3debbaf4ef797025e06efa45ba588f8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 16 Aug 2024 04:32:25 +0200 Subject: [PATCH 022/157] GH-43688: [C++] Prevent Snappy from disabling RTTI when bundled (#43706) ### Rationale for this change Snappy's CMakeLists.txt unconditionally disables RTTI. This is incompatible with some other options, such as activating UBSAN for a fuzzing build: https://github.com/google/snappy/issues/189 ### What changes are included in this PR? Add `-frtti` at the end of compiler options when compiling a bundled Snappy build. ### Are these changes tested? On CI; also manually checked that this allows enabling Snappy on OSS-Fuzz builds. ### Are there any user-facing changes? No. * GitHub Issue: #43688 Lead-authored-by: Antoine Pitrou Co-authored-by: Antoine Pitrou Co-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 22 ++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 495aa704836..bc3a3a2249d 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1355,16 +1355,24 @@ macro(build_snappy) "-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") # Snappy unconditionally enables -Werror when building with clang this can lead # to build failures by way of new compiler warnings. This adds a flag to disable - # Werror to the very end of the invocation to override the snappy internal setting. + # -Werror to the very end of the invocation to override the snappy internal setting. + set(SNAPPY_ADDITIONAL_CXX_FLAGS "") if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - foreach(CONFIG DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) - list(APPEND - SNAPPY_CMAKE_ARGS - "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS_${CONFIG}} -Wno-error" - ) - endforeach() + string(APPEND SNAPPY_ADDITIONAL_CXX_FLAGS " -Wno-error") + endif() + # Snappy unconditionally disables RTTI, which is incompatible with some other + # build settings (https://github.com/apache/arrow/issues/43688). + if(NOT MSVC) + string(APPEND SNAPPY_ADDITIONAL_CXX_FLAGS " -frtti") endif() + foreach(CONFIG DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + list(APPEND + SNAPPY_CMAKE_ARGS + "-DCMAKE_CXX_FLAGS_${CONFIG}=${EP_CXX_FLAGS_${CONFIG}} ${SNAPPY_ADDITIONAL_CXX_FLAGS}" + ) + endforeach() + if(APPLE AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) # On macOS 10.13 we need to explicitly add to avoid a missing include error # This can be removed once CRAN no longer checks on macOS 10.13 From e9767c1a268f543536077cf80f49b097739f308c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 16 Aug 2024 11:43:32 +0900 Subject: [PATCH 023/157] GH-41396: [Ruby] Add workaround for re2.pc on Ubuntu 20.04 (#43721) ### Rationale for this change Old re2.pc add "-std=c++11" but it causes a build error. 
Because Apache Arrow C++ requires C++17. ### What changes are included in this PR? Remove "-std=c++11" as workaround. We can remove this workaround when we drop support for Ubuntu 20.04. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #41396 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ruby/red-arrow/ext/arrow/extconf.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ruby/red-arrow/ext/arrow/extconf.rb b/ruby/red-arrow/ext/arrow/extconf.rb index 7ef3c6c8343..28ccd0b2d59 100644 --- a/ruby/red-arrow/ext/arrow/extconf.rb +++ b/ruby/red-arrow/ext/arrow/extconf.rb @@ -66,6 +66,13 @@ exit(false) end +# Old re2.pc (e.g. re2.pc on Ubuntu 20.04) may add -std=c++11. It +# causes a build error because Apache Arrow C++ requires C++17 or +# later. +# +# We can remove this when we drop support for Ubuntu 20.04. +$CXXFLAGS.gsub!("-std=c++11", "") + [ ["glib2", "ext/glib2"], ].each do |name, relative_source_dir| From b80a51a65c8031bbd2d1d2e5645c541bd7076b5b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 16 Aug 2024 14:23:05 +0900 Subject: [PATCH 024/157] GH-43594: [C++] Remove std::optional from arrow::ArrayStatistics::is_{min,max}_exact (#43595) ### Rationale for this change We don't need "unknown" state. If they aren't set, we can process they are not exact. ### What changes are included in this PR? Remove `std::optional` from `arrow::ArrayStatistics::is_{min,max}_exact`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #43594 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/array/statistics.h | 8 ++++---- cpp/src/arrow/array/statistics_test.cc | 14 ++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 816d68e7776..523f877bbe4 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -43,14 +43,14 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief The minimum value, may not be set std::optional min = std::nullopt; - /// \brief Whether the minimum value is exact or not, may not be set - std::optional is_min_exact = std::nullopt; + /// \brief Whether the minimum value is exact or not + bool is_min_exact = false; /// \brief The maximum value, may not be set std::optional max = std::nullopt; - /// \brief Whether the maximum value is exact or not, may not be set - std::optional is_max_exact = std::nullopt; + /// \brief Whether the maximum value is exact or not + bool is_max_exact = false; /// \brief Check two statistics for equality bool Equals(const ArrayStatistics& other) const { diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index f4f4f500151..cf15a5d3829 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -40,27 +40,25 @@ TEST(ArrayStatisticsTest, TestDistinctCount) { TEST(ArrayStatisticsTest, TestMin) { ArrayStatistics statistics; ASSERT_FALSE(statistics.min.has_value()); - ASSERT_FALSE(statistics.is_min_exact.has_value()); + ASSERT_FALSE(statistics.is_min_exact); statistics.min = static_cast(29); statistics.is_min_exact = true; ASSERT_TRUE(statistics.min.has_value()); ASSERT_TRUE(std::holds_alternative(statistics.min.value())); ASSERT_EQ(29, std::get(statistics.min.value())); - ASSERT_TRUE(statistics.is_min_exact.has_value()); - ASSERT_TRUE(statistics.is_min_exact.value()); + ASSERT_TRUE(statistics.is_min_exact); } 
TEST(ArrayStatisticsTest, TestMax) { ArrayStatistics statistics; ASSERT_FALSE(statistics.max.has_value()); - ASSERT_FALSE(statistics.is_max_exact.has_value()); + ASSERT_FALSE(statistics.is_max_exact); statistics.max = std::string("hello"); statistics.is_max_exact = false; ASSERT_TRUE(statistics.max.has_value()); ASSERT_TRUE(std::holds_alternative(statistics.max.value())); ASSERT_EQ("hello", std::get(statistics.max.value())); - ASSERT_TRUE(statistics.is_max_exact.has_value()); - ASSERT_FALSE(statistics.is_max_exact.value()); + ASSERT_FALSE(statistics.is_max_exact); } TEST(ArrayStatisticsTest, TestEquality) { @@ -84,9 +82,9 @@ TEST(ArrayStatisticsTest, TestEquality) { statistics2.min = std::string("world"); ASSERT_EQ(statistics1, statistics2); - statistics1.is_min_exact = false; + statistics1.is_min_exact = true; ASSERT_NE(statistics1, statistics2); - statistics2.is_min_exact = false; + statistics2.is_min_exact = true; ASSERT_EQ(statistics1, statistics2); statistics1.max = static_cast(-29); From bee2fc8021f3b5dabff0315fe20290f316a44ce4 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 16 Aug 2024 09:59:30 +0200 Subject: [PATCH 025/157] MINOR: [Docs][Python] Add LargeListType to Data Types docs (#43597) ### Rationale for this change The `LargeListType` is missing in the Data Types docs: https://arrow.apache.org/docs/python/api/datatypes.html#type-classes ### What changes are included in this PR? This PR adds the `LargeListType` to the Data Types docs. ### Are these changes tested? The change only affects the docs. I have generated the docs locally and they appear as expected. See comment below with screenshot: https://github.com/apache/arrow/pull/43597#issuecomment-2273139016 ### Are there any user-facing changes? The change is indeed an update in the docs. Authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- docs/source/python/api/datatypes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index a43c5299eae..86c29296873 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -96,6 +96,7 @@ functions above. DataType DictionaryType ListType + LargeListType MapType StructType UnionType From d801daeddead7ceaca83424874ea006245430bc3 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 16 Aug 2024 18:16:49 +0800 Subject: [PATCH 026/157] MINOR: [Go][Doc] fix code format in the readme (#43725) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? Authored-by: Xin Hao Signed-off-by: mwish --- go/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/README.md b/go/README.md index 51ac06c87f1..ec824229729 100644 --- a/go/README.md +++ b/go/README.md @@ -40,7 +40,7 @@ import ( ) func main() { - dsn := "uri=grpc://localhost:12345;username=mickeymouse;password=p@55w0RD" + dsn := "uri=grpc://localhost:12345;username=mickeymouse;password=p@55w0RD" db, err := sql.Open("flightsql", dsn) ... 
} From 801301ee22ce802fd000f9f4b919abb47ae1d6c3 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 16 Aug 2024 14:40:56 -0700 Subject: [PATCH 027/157] GH-43633: [R] Add tests for packages that might be tricky to roundtrip data to Tables + Parquet files (#43634) ### Rationale for this change Add coverage for objects that might have issues roundtripping to Arrow Tables or Parquet files ### What changes are included in this PR? A new test file + a crossbow job that ensures these other packages are installed so the tests run. ### Are these changes tested? The changes are tests ### Are there any user-facing changes? No * GitHub Issue: #43633 Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- dev/tasks/r/github.linux.extra.packages.yml | 53 +++++++++ dev/tasks/tasks.yml | 4 + .../testthat/test-extra-package-roundtrip.R | 105 ++++++++++++++++++ 3 files changed, 162 insertions(+) create mode 100644 dev/tasks/r/github.linux.extra.packages.yml create mode 100644 r/tests/testthat/test-extra-package-roundtrip.R diff --git a/dev/tasks/r/github.linux.extra.packages.yml b/dev/tasks/r/github.linux.extra.packages.yml new file mode 100644 index 00000000000..bb486c72a06 --- /dev/null +++ b/dev/tasks/r/github.linux.extra.packages.yml @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +{% import 'macros.jinja' as macros with context %} + +{{ macros.github_header() }} + +jobs: + extra-packages: + name: "extra package roundtrip tests" + runs-on: ubuntu-latest + strategy: + fail-fast: false + env: + ARROW_R_DEV: "FALSE" + ARROW_R_FORCE_EXTRA_PACKAGE_TESTS: TRUE + steps: + {{ macros.github_checkout_arrow()|indent }} + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + - uses: r-lib/actions/setup-pandoc@v2 + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + working-directory: 'arrow/r' + extra-packages: | + any::data.table + any::rcmdcheck + any::readr + any::units + - name: Build arrow package + run: | + R CMD build --no-build-vignettes arrow/r + R CMD INSTALL --install-tests --no-test-load --no-byte-compile arrow_*.tar.gz + - name: run tests + run: | + testthat::test_package("arrow", filter = "extra-package-roundtrip") + shell: Rscript {0} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 6e1f7609a98..a9da7eb2889 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1309,6 +1309,10 @@ tasks: ci: github template: r/github.linux.rchk.yml + test-r-extra-packages: + ci: github + template: r/github.linux.extra.packages.yml + test-r-linux-as-cran: ci: github template: r/github.linux.cran.yml diff --git a/r/tests/testthat/test-extra-package-roundtrip.R b/r/tests/testthat/test-extra-package-roundtrip.R new file mode 100644 index 00000000000..09a87ef19d5 --- /dev/null +++ b/r/tests/testthat/test-extra-package-roundtrip.R @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_on_cran() + +# Any additional package that we test here that is not already in DESCRIPTION should be +# added to dev/tasks/r/github.linux.extra.packages.yml in the r-lib/actions/setup-r-dependencies@v2 +# step so that they are installed + available in that CI job. 
+ +# So that we can force these in CI +load_or_skip <- function(pkg) { + if (identical(tolower(Sys.getenv("ARROW_R_FORCE_EXTRA_PACKAGE_TESTS")), "true")) { + # because of this indirection on the package name we also avoid a CHECK note and + # we don't otherwise need to Suggest this + requireNamespace(pkg, quietly = TRUE) + } else { + skip_if(!requireNamespace(pkg, quietly = TRUE)) + } + attachNamespace(pkg) +} + +library(dplyr) + +test_that("readr read csvs roundtrip", { + load_or_skip("readr") + + tbl <- example_data[, c("dbl", "lgl", "false", "chr")] + + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + # we should still be able to turn this into a table + new_df <- read_csv(tf, show_col_types = FALSE) + expect_equal(new_df, as_tibble(arrow_table(new_df))) + + # we should still be able to turn this into a table + new_df <- read_csv(tf, show_col_types = FALSE, lazy = TRUE) + expect_equal(new_df, as_tibble(arrow_table(new_df))) + + # and can roundtrip to a parquet file + pq_tmp_file <- tempfile() + write_parquet(new_df, pq_tmp_file) + new_df_read <- read_parquet(pq_tmp_file) + + # we should still be able to turn this into a table + expect_equal(new_df, new_df_read) +}) + +test_that("data.table objects roundtrip", { + load_or_skip("data.table") + + # https://github.com/Rdatatable/data.table/blob/83fd2c05ce2d8555ceb8ba417833956b1b574f7e/R/cedta.R#L25-L27 + .datatable.aware=TRUE + + DT <- as.data.table(example_data) + + # Table -> collect which is what writing + reading to parquet uses under the hood to roundtrip + tab <- as_arrow_table(DT) + DT_read <- collect(tab) + + # we should still be able to turn this into a table + # the .internal.selfref attribute is automatically ignored by testthat/waldo + expect_equal(DT, DT_read) + + # and we can set keys + indices + create new columns + setkey(DT, chr) + setindex(DT, dbl) + DT[, dblshift := data.table::shift(dbl, 1)] + + # Table -> collect + tab <- as_arrow_table(DT) + DT_read <- collect(tab) + + # we should still be able to turn this into a table + expect_equal(DT, DT_read) +}) + +test_that("units roundtrip", { + load_or_skip("units") + + tbl <- example_data + units(tbl$dbl) <- "s" + + # Table -> collect which is what writing + reading to parquet uses under the hood to roundtrip + tab <- as_arrow_table(tbl) + tbl_read <- collect(tab) + + # we should still be able to turn this into a table + expect_equal(tbl, tbl_read) +}) From 8836535785ba3dd4ba335818a34e0479929b70e6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 11:20:16 +0900 Subject: [PATCH 028/157] GH-43702: [C++][FS][Azure] Use the latest Azurite and update the bundled Azure SDK for C++ to azure-identity_1.9.0 (#43723) ### Rationale for this change Some our CI jobs (such as conda based jobs) use recent Azure SDK for C++ and they require latest Azurite. We need to update Azurite for these jobs. I wanted to use the latest Azurite on all environments but I didn't. Because I want to keep using `apt install nodejs` on old Ubuntu for easy to maintain. ### What changes are included in this PR? * Use the latest Azurite if possible * Use `--skipApiVersionCheck` for old Azurite * Update the bundled Azure SDK for C++ * This is not required. It's for detecting this problem in many CI jobs. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
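As a rough illustration of what the test fixture does with the extra flag, here is a minimal sketch assuming Boost.Process and an `azurite` binary on `PATH`; it mirrors the `azurefs_test.cc` change below but is not the actual fixture code:

```cpp
#include <boost/process.hpp>

#include <iostream>

namespace bp = boost::process;

int main() {
  // Locate the npm-installed emulator on PATH.
  auto azurite = bp::search_path("azurite");
  if (azurite.empty()) {
    std::cerr << "azurite not found\n";
    return 1;
  }
  // Start the blob emulator; --skipApiVersionCheck keeps an old Azurite
  // (pinned where only Node.js 12 is available) usable with a newer
  // Azure SDK for C++ that sends newer API versions.
  bp::child server(azurite, "--silent", "--location", "/tmp/azurite-data",
                   "--skipApiVersionCheck");
  // ... run file system tests against the emulator here ...
  server.terminate();
  server.wait();
  return 0;
}
```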
* GitHub Issue: fix #41505 * GitHub Issue: #43702 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ci/scripts/install_azurite.sh | 24 ++++++++++++++++++------ cpp/src/arrow/filesystem/azurefs_test.cc | 5 ++++- cpp/thirdparty/versions.txt | 4 ++-- python/pyarrow/tests/conftest.py | 3 +++ 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh index dda5e99405b..b8b1618bed3 100755 --- a/ci/scripts/install_azurite.sh +++ b/ci/scripts/install_azurite.sh @@ -19,20 +19,32 @@ set -e -# Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 +node_version="$(node --version)" +echo "node version = ${node_version}" + +case "${node_version}" in + v12*) + # Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 + azurite_version=v3.29.0 + ;; + *) + azurite_version=latest + ;; +esac + case "$(uname)" in Darwin) - npm install -g azurite@v3.29.0 + npm install -g azurite@${azurite_version} which azurite ;; MINGW*) choco install nodejs.install - npm install -g azurite@v3.29.0 + npm install -g azurite@${azurite_version} ;; Linux) - npm install -g azurite@v3.29.0 + npm install -g azurite@${azurite_version} which azurite ;; esac -echo "node version = $(node --version)" -echo "azurite version = $(azurite --version)" \ No newline at end of file + +echo "azurite version = $(azurite --version)" diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 36646f417cb..5ff241b17ff 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -198,7 +198,10 @@ class AzuriteEnv : public AzureEnvImpl { self->temp_dir_->path().Join("debug.log")); auto server_process = bp::child( boost::this_process::environment(), exe_path, "--silent", "--location", - self->temp_dir_->path().ToString(), "--debug", self->debug_log_path_.ToString()); + self->temp_dir_->path().ToString(), "--debug", self->debug_log_path_.ToString(), + // For old Azurite. We can't install the latest Azurite with + // old Node.js on old Ubuntu. + "--skipApiVersionCheck"); if (!server_process.valid() || !server_process.running()) { server_process.terminate(); server_process.wait(); diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 16689c17fba..30fa24a2094 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -54,8 +54,8 @@ ARROW_AWS_LC_BUILD_SHA256_CHECKSUM=ae96a3567161552744fc0cae8b4d68ed88b1ec0f3d3c9 ARROW_AWSSDK_BUILD_VERSION=1.10.55 ARROW_AWSSDK_BUILD_SHA256_CHECKSUM=2d552fb1a84bef4a9b65e34aa7031851ed2aef5319e02cc6e4cb735c48aa30de # Despite the confusing version name this is still the whole Azure SDK for C++ including core, keyvault, storage-common, etc. 
-ARROW_AZURE_SDK_BUILD_VERSION=azure-core_1.10.3 -ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM=dd624c2f86adf474d2d0a23066be6e27af9cbd7e3f8d9d8fd7bf981e884b7b48 +ARROW_AZURE_SDK_BUILD_VERSION=azure-identity_1.9.0 +ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM=97065bfc971ac8df450853ce805f820f52b59457bd7556510186a1569502e4a1 ARROW_BOOST_BUILD_VERSION=1.81.0 ARROW_BOOST_BUILD_SHA256_CHECKSUM=9e0ffae35528c35f90468997bc8d99500bf179cbae355415a89a600c38e13574 ARROW_BROTLI_BUILD_VERSION=v1.0.9 diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 343b602995d..e1919497b51 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -263,6 +263,9 @@ def azure_server(tmpdir_factory): tmpdir = tmpdir_factory.getbasetemp() # We only need blob service emulator, not queue or table. args = ['azurite-blob', "--location", tmpdir, "--blobPort", str(port)] + # For old Azurite. We can't install the latest Azurite with old + # Node.js on old Ubuntu. + args += ["--skipApiVersionCheck"] proc = None try: proc = subprocess.Popen(args, env=env) From 49be60f5c424cca40bbc5a6d1948ad7e800afaab Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 11:50:46 +0900 Subject: [PATCH 029/157] GH-43175: [C++] Skip not Emscripten ready tests in CSV tests (#43724) ### Rationale for this change We can't use thread nor `%z` on Emacripten. Some CSV tests use them. ### What changes are included in this PR? Skip CSV tests that use thread or `%z`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #43175 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ci/scripts/cpp_test.sh | 2 +- cpp/src/arrow/csv/column_decoder_test.cc | 11 +++++++++++ cpp/src/arrow/csv/converter_test.cc | 5 +++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 2c640f2c1fb..7912bf23e49 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -80,7 +80,7 @@ case "$(uname)" in ;; esac -if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then n_jobs=1 # avoid spurious fails on emscripten due to loading too many big executables fi diff --git a/cpp/src/arrow/csv/column_decoder_test.cc b/cpp/src/arrow/csv/column_decoder_test.cc index ebac7a3da2f..56773264717 100644 --- a/cpp/src/arrow/csv/column_decoder_test.cc +++ b/cpp/src/arrow/csv/column_decoder_test.cc @@ -175,6 +175,9 @@ class NullColumnDecoderTest : public ColumnDecoderTest { } void TestThreaded() { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif constexpr int NITERS = 10; auto type = int32(); MakeDecoder(type); @@ -257,6 +260,10 @@ class TypedColumnDecoderTest : public ColumnDecoderTest { } void TestThreaded() { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int NITERS = 10; auto type = uint32(); MakeDecoder(type, default_options); @@ -305,6 +312,10 @@ class InferringColumnDecoderTest : public ColumnDecoderTest { } void TestThreaded() { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int NITERS = 10; auto type = float64(); MakeDecoder(default_options); diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index ea4e171d57e..657e8d813ca 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -625,6 +625,11 @@ TEST(TimestampConversion, UserDefinedParsers) { } 
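The converter test below gets the same treatment as the column decoder tests above. As a standalone illustration of the guard pattern (the test name here is made up for the example):

```cpp
#include <gtest/gtest.h>

// Illustrative only: skip at runtime when the build has no threading support
// or when running under Emscripten, instead of failing.
TEST(GuardPatternExample, SkipsWhenUnsupported) {
#ifndef ARROW_ENABLE_THREADING
  GTEST_SKIP() << "Test requires threading support";
#endif
#ifdef __EMSCRIPTEN__
  GTEST_SKIP() << "Not supported under Emscripten";
#endif
  // ... body that spawns threads or parses a %z timezone offset ...
  SUCCEED();
}
```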
TEST(TimestampConversion, UserDefinedParsersWithZone) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Test temporarily disabled due to emscripten bug " + "https://github.com/emscripten-core/emscripten/issues/20467"; +#endif + auto options = ConvertOptions::Defaults(); auto type = timestamp(TimeUnit::SECOND, "America/Phoenix"); From fbac12c353cb6ead58a5ee765b37bd1bc46cd672 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 17 Aug 2024 17:16:39 -0500 Subject: [PATCH 030/157] MINOR: [R] Fix a package namespace warning (#43737) Oops, I should have caught this in #43633 Removes `data.table::` since the namespace is loaded. Also fix some linting errors and free up space on the force tests run. Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- .github/workflows/r.yml | 3 +++ r/tests/testthat/test-extra-package-roundtrip.R | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index c4899ddcc49..bf7eb99e7e9 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -133,6 +133,9 @@ jobs: with: fetch-depth: 0 submodules: recursive + - name: Free up disk space + run: | + ci/scripts/util_free_space.sh - name: Cache Docker Volumes uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: diff --git a/r/tests/testthat/test-extra-package-roundtrip.R b/r/tests/testthat/test-extra-package-roundtrip.R index 09a87ef19d5..092288dffb9 100644 --- a/r/tests/testthat/test-extra-package-roundtrip.R +++ b/r/tests/testthat/test-extra-package-roundtrip.R @@ -24,7 +24,7 @@ skip_on_cran() # So that we can force these in CI load_or_skip <- function(pkg) { if (identical(tolower(Sys.getenv("ARROW_R_FORCE_EXTRA_PACKAGE_TESTS")), "true")) { - # because of this indirection on the package name we also avoid a CHECK note and + # because of this indirection on the package name we also avoid a CHECK note and # we don't otherwise need to Suggest this requireNamespace(pkg, quietly = TRUE) } else { @@ -46,11 +46,11 @@ test_that("readr read csvs roundtrip", { # we should still be able to turn this into a table new_df <- read_csv(tf, show_col_types = FALSE) - expect_equal(new_df, as_tibble(arrow_table(new_df))) + expect_equal(new_df, as_tibble(arrow_table(new_df))) # we should still be able to turn this into a table new_df <- read_csv(tf, show_col_types = FALSE, lazy = TRUE) - expect_equal(new_df, as_tibble(arrow_table(new_df))) + expect_equal(new_df, as_tibble(arrow_table(new_df))) # and can roundtrip to a parquet file pq_tmp_file <- tempfile() @@ -65,11 +65,11 @@ test_that("data.table objects roundtrip", { load_or_skip("data.table") # https://github.com/Rdatatable/data.table/blob/83fd2c05ce2d8555ceb8ba417833956b1b574f7e/R/cedta.R#L25-L27 - .datatable.aware=TRUE + .datatable.aware <- TRUE DT <- as.data.table(example_data) - # Table -> collect which is what writing + reading to parquet uses under the hood to roundtrip + # Table to collect which is what writing + reading to parquet uses under the hood to roundtrip tab <- as_arrow_table(DT) DT_read <- collect(tab) @@ -80,9 +80,9 @@ test_that("data.table objects roundtrip", { # and we can set keys + indices + create new columns setkey(DT, chr) setindex(DT, dbl) - DT[, dblshift := data.table::shift(dbl, 1)] + DT[, dblshift := shift(dbl, 1)] - # Table -> collect + # Table to collect tab <- as_arrow_table(DT) DT_read <- collect(tab) @@ -96,7 +96,7 @@ test_that("units roundtrip", { tbl <- example_data units(tbl$dbl) <- "s" - # Table -> collect which is what writing + 
reading to parquet uses under the hood to roundtrip + # Table to collect which is what writing + reading to parquet uses under the hood to roundtrip tab <- as_arrow_table(tbl) tbl_read <- collect(tab) From b7e618f088540a45e2ddab39696ce3d543821763 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 18 Aug 2024 10:42:53 +0900 Subject: [PATCH 031/157] GH-43738: [GLib] Add `GArrowAzureFileSytem` (#43739) ### Rationale for this change The bindings for `arrow::fs::AzureFileSytem` is missing. ### What changes are included in this PR? Add the bindings for `arrow::fs::AzureFileSytem`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #43738 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/file-system.cpp | 16 ++++++++++++++++ c_glib/arrow-glib/file-system.h | 12 ++++++++++++ 2 files changed, 28 insertions(+) diff --git a/c_glib/arrow-glib/file-system.cpp b/c_glib/arrow-glib/file-system.cpp index b6efa2b8726..9ba494e4059 100644 --- a/c_glib/arrow-glib/file-system.cpp +++ b/c_glib/arrow-glib/file-system.cpp @@ -56,6 +56,8 @@ G_BEGIN_DECLS * #GArrowS3FileSystem is a class for S3-backed file system. * * #GArrowGCSFileSystem is a class for GCS-backed file system. + * + * #GArrowAzureFileSystem is a class for Azure-backed file system. */ /* arrow::fs::FileInfo */ @@ -1561,6 +1563,18 @@ garrow_gcs_file_system_class_init(GArrowGCSFileSystemClass *klass) { } +G_DEFINE_TYPE(GArrowAzureFileSystem, garrow_azure_file_system, GARROW_TYPE_FILE_SYSTEM) + +static void +garrow_azure_file_system_init(GArrowAzureFileSystem *file_system) +{ +} + +static void +garrow_azure_file_system_class_init(GArrowAzureFileSystemClass *klass) +{ +} + G_END_DECLS GArrowFileInfo * @@ -1592,6 +1606,8 @@ garrow_file_system_new_raw(std::shared_ptr *arrow_file_sy file_system_type = GARROW_TYPE_S3_FILE_SYSTEM; } else if (type_name == "gcs") { file_system_type = GARROW_TYPE_GCS_FILE_SYSTEM; + } else if (type_name == "abfs") { + file_system_type = GARROW_TYPE_AZURE_FILE_SYSTEM; } else if (type_name == "mock") { file_system_type = GARROW_TYPE_MOCK_FILE_SYSTEM; } diff --git a/c_glib/arrow-glib/file-system.h b/c_glib/arrow-glib/file-system.h index 2e500672e14..9a903c6af68 100644 --- a/c_glib/arrow-glib/file-system.h +++ b/c_glib/arrow-glib/file-system.h @@ -337,4 +337,16 @@ struct _GArrowGCSFileSystemClass GArrowFileSystemClass parent_class; }; +#define GARROW_TYPE_AZURE_FILE_SYSTEM (garrow_azure_file_system_get_type()) +GARROW_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GArrowAzureFileSystem, + garrow_azure_file_system, + GARROW, + AZURE_FILE_SYSTEM, + GArrowFileSystem) +struct _GArrowAzureFileSystemClass +{ + GArrowFileSystemClass parent_class; +}; + G_END_DECLS From 5ef7e01053c526389acefddd6f961bf1fd9d274b Mon Sep 17 00:00:00 2001 From: Jin Chengcheng Date: Sun, 18 Aug 2024 15:28:52 +0800 Subject: [PATCH 032/157] GH-43506: [Java] Fix TestFragmentScanOptions result not match (#43639) ### Rationale for this change JNI test was not tested in CI. So the test failed but passed the CI. The parseChar function should return char but return bool, a typo error. ### What changes are included in this PR? ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #43506 Authored-by: Chengcheng Jin Signed-off-by: David Li --- java/dataset/src/main/cpp/jni_wrapper.cc | 2 +- .../dataset/TestFragmentScanOptions.java | 80 ++++++++++++------- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 63b8dd73f47..49cc85251c8 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -368,7 +368,7 @@ std::shared_ptr LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec inline bool ParseBool(const std::string& value) { return value == "true" ? true : false; } -inline bool ParseChar(const std::string& key, const std::string& value) { +inline char ParseChar(const std::string& key, const std::string& value) { if (value.size() != 1) { JniThrow("Option " + key + " should be a char, but is " + value); } diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java index d5981905288..ed6344f0f9c 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java @@ -51,6 +51,16 @@ public class TestFragmentScanOptions { + private CsvFragmentScanOptions create( + ArrowSchema cSchema, + Map convertOptionsMap, + Map readOptions, + Map parseOptions) { + CsvConvertOptions convertOptions = new CsvConvertOptions(convertOptionsMap); + convertOptions.setArrowSchema(cSchema); + return new CsvFragmentScanOptions(convertOptions, readOptions, parseOptions); + } + @Test public void testCsvConvertOptions() throws Exception { final Schema schema = @@ -63,24 +73,29 @@ public void testCsvConvertOptions() throws Exception { String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator); + ArrowSchema cSchema2 = ArrowSchema.allocateNew(allocator); CDataDictionaryProvider provider = new CDataDictionaryProvider()) { Data.exportSchema(allocator, schema, provider, cSchema); - CsvConvertOptions convertOptions = new CsvConvertOptions(ImmutableMap.of("delimiter", ";")); - convertOptions.setArrowSchema(cSchema); - CsvFragmentScanOptions fragmentScanOptions = - new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), ImmutableMap.of()); + Data.exportSchema(allocator, schema, provider, cSchema2); + CsvFragmentScanOptions fragmentScanOptions1 = + create(cSchema, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of("delimiter", ";")); + CsvFragmentScanOptions fragmentScanOptions2 = + create(cSchema2, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of("delimiter", ";")); ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) .columns(Optional.empty()) - .fragmentScanOptions(fragmentScanOptions) + .fragmentScanOptions(fragmentScanOptions1) .build(); try (DatasetFactory datasetFactory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + allocator, + NativeMemoryPool.getDefault(), + FileFormat.CSV, + path, + Optional.of(fragmentScanOptions2)); Dataset dataset = datasetFactory.finish(); Scanner scanner = dataset.newScan(options); ArrowReader reader = scanner.scanBatches()) { - assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) 
{ @@ -106,30 +121,38 @@ public void testCsvConvertOptionsDelimiterNotSet() throws Exception { String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator); + ArrowSchema cSchema2 = ArrowSchema.allocateNew(allocator); CDataDictionaryProvider provider = new CDataDictionaryProvider()) { Data.exportSchema(allocator, schema, provider, cSchema); - CsvConvertOptions convertOptions = new CsvConvertOptions(ImmutableMap.of()); - convertOptions.setArrowSchema(cSchema); - CsvFragmentScanOptions fragmentScanOptions = - new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), ImmutableMap.of()); + Data.exportSchema(allocator, schema, provider, cSchema2); + CsvFragmentScanOptions fragmentScanOptions1 = + create(cSchema, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of()); + CsvFragmentScanOptions fragmentScanOptions2 = + create(cSchema2, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of()); ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) .columns(Optional.empty()) - .fragmentScanOptions(fragmentScanOptions) + .fragmentScanOptions(fragmentScanOptions1) .build(); try (DatasetFactory datasetFactory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + allocator, + NativeMemoryPool.getDefault(), + FileFormat.CSV, + path, + Optional.of(fragmentScanOptions2)); Dataset dataset = datasetFactory.finish(); Scanner scanner = dataset.newScan(options); ArrowReader reader = scanner.scanBatches()) { - - assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { - final ValueIterableVector idVector = - (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id"); - assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(1, 2, 3)); + final ValueIterableVector idVector = + (ValueIterableVector) + reader.getVectorSchemaRoot().getVector("Id;Name;Language"); + assertThat( + idVector.getValueIterable(), + IsIterableContainingInOrder.contains( + new Text("1;Juno;Java"), new Text("2;Peter;Python"), new Text("3;Celin;C++"))); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(3, rowCount); @@ -157,13 +180,12 @@ public void testCsvConvertOptionsNoOption() throws Exception { assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { - final ValueIterableVector idVector = - (ValueIterableVector) - reader.getVectorSchemaRoot().getVector("Id;Name;Language"); + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id;Name;Language"); assertThat( idVector.getValueIterable(), IsIterableContainingInOrder.contains( - "1;Juno;Java\n" + "2;Peter;Python\n" + "3;Celin;C++")); + new Text("1;Juno;Java"), new Text("2;Peter;Python"), new Text("3;Celin;C++"))); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(3, rowCount); @@ -174,7 +196,10 @@ public void testCsvConvertOptionsNoOption() throws Exception { public void testCsvReadParseAndReadOptions() throws Exception { final Schema schema = new Schema( - Collections.singletonList(Field.nullable("Id;Name;Language", new ArrowType.Utf8())), + Arrays.asList( + Field.nullable("Id", new ArrowType.Int(64, true)), + Field.nullable("Name", new ArrowType.Utf8()), + Field.nullable("Language", new 
ArrowType.Utf8())), null); String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); @@ -202,12 +227,9 @@ public void testCsvReadParseAndReadOptions() throws Exception { assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { - final ValueIterableVector idVector = - (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id;Name;Language"); - assertThat( - idVector.getValueIterable(), - IsIterableContainingInOrder.contains( - new Text("2;Peter;Python"), new Text("3;Celin;C++"))); + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id"); + assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(2L, 3L)); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(2, rowCount); From 1ae38d0d42c1ae5800e42b613f22593673b7370c Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sun, 18 Aug 2024 08:48:55 -0500 Subject: [PATCH 033/157] GH-43735: [R] AWS SDK fails to build on one of CRAN's M1 builders (#43736) Trying to replicate the issue's on CRAN's M1 machine so that we can fix them. * GitHub Issue: #43735 Lead-authored-by: Jonathan Keane Co-authored-by: Sutou Kouhei Signed-off-by: Jonathan Keane --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 12 +++ dev/tasks/r/github.macos.cran.yml | 82 +++++++++++++++++++++ dev/tasks/tasks.yml | 4 + 3 files changed, 98 insertions(+) create mode 100644 dev/tasks/r/github.macos.cran.yml diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index bc3a3a2249d..63e2c036c9a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4965,8 +4965,20 @@ macro(build_awssdk) set(AWSSDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/awssdk_ep-install") set(AWSSDK_INCLUDE_DIR "${AWSSDK_PREFIX}/include") + # The AWS SDK has a few warnings around shortening lengths + set(AWS_C_FLAGS "${EP_C_FLAGS}") + set(AWS_CXX_FLAGS "${EP_CXX_FLAGS}") + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") + # Negate warnings that AWS SDK cannot build under + string(APPEND AWS_C_FLAGS " -Wno-error=shorten-64-to-32") + string(APPEND AWS_CXX_FLAGS " -Wno-error=shorten-64-to-32") + endif() + set(AWSSDK_COMMON_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} + -DCMAKE_C_FLAGS=${AWS_C_FLAGS} + -DCMAKE_CXX_FLAGS=${AWS_CXX_FLAGS} -DCPP_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX} -DCMAKE_PREFIX_PATH=${AWSSDK_PREFIX} diff --git a/dev/tasks/r/github.macos.cran.yml b/dev/tasks/r/github.macos.cran.yml new file mode 100644 index 00000000000..33965988e21 --- /dev/null +++ b/dev/tasks/r/github.macos.cran.yml @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +{% import 'macros.jinja' as macros with context %} + +{{ macros.github_header() }} + +jobs: + macos-cran: + name: "macOS similar to CRAN" + runs-on: macOS-latest + strategy: + fail-fast: false + + steps: + {{ macros.github_checkout_arrow()|indent }} + + - name: Configure dependencies (macos) + run: | + brew install openssl + # disable sccache on macos as it times out for unknown reasons + # see GH-33721 + # brew install sccache + # remove cmake so that we can test our cmake downloading abilities + brew uninstall cmake + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + # CRAN builders have the entire bin here added to the path. This sometimes + # includes things like GNU libtool which name-collide with what we expect + - name: Add R.framework/Resources/bin to the path + run: echo "/Library/Frameworks/R.framework/Resources/bin" >> $GITHUB_PATH + - name : Check whether libtool in R is used + run: | + if [ "$(which libtool)" != "/Library/Frameworks/R.framework/Resources/bin/libtool" ]; then + echo "libtool provided by R isn't found: $(which libtool)" + exit 1 + fi + - name: Install dependencies + uses: r-lib/actions/setup-r-dependencies@v2 + with: + cache: false # cache does not work on across branches + working-directory: arrow/r + extra-packages: | + any::rcmdcheck + any::sys + - name: Install + env: + _R_CHECK_CRAN_INCOMING_: false + CXX: "clang++ -mmacos-version-min=14.6" + CFLAGS: "-falign-functions=8 -g -O2 -Wall -pedantic -Wconversion -Wno-sign-conversion -Wstrict-prototypes" + CXXFLAGS: "-g -O2 -Wall -pedantic -Wconversion -Wno-sign-conversion" + NOT_CRAN: false + run: | + sccache --start-server || echo 'sccache not found' + cd arrow/r + R CMD INSTALL . --install-tests + - name: Run the tests + run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' + - name: Dump test logs + run: cat arrow-tests/testthat.Rout* + if: failure() + - name: Save the test output + uses: actions/upload-artifact@v2 + with: + name: test-output + path: arrow-tests/testthat.Rout* + if: always() diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index a9da7eb2889..fe02fe9ce68 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1319,6 +1319,10 @@ tasks: params: MATRIX: {{ "${{ matrix.r_image }}" }} + test-r-macos-as-cran: + ci: github + template: r/github.macos.cran.yml + test-r-arrow-backwards-compatibility: ci: github template: r/github.linux.arrow.version.back.compat.yml From 5e68513d62b0d216e916de6a1ad2db04f5d1a7bf Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Mon, 19 Aug 2024 18:39:05 +0800 Subject: [PATCH 034/157] GH-43495: [C++][Compute] Widen the row offset of the row table to 64-bit (#43389) ### Rationale for this change The row table uses `uint32_t` as the row offset within the row data buffer, effectively limiting the row data from growing beyond 4GB. This is quite restrictive, and the impact is described in more detail in #43495. This PR proposes to widen the row offset from 32-bit to 64-bit to address this limitation. #### Benefits Currently, the row table has three major limitations: 1. The overall data size cannot exceed 4GB. 2. The size of a single row cannot exceed 4GB. 3. The number of rows cannot exceed 2^32. This enhancement will eliminate the first limitation. Meanwhile, the second and third limitations are less likely to occur. 
Thus, this change will enable a significant range of use cases that are currently unsupported. #### Overhead Of course, this will introduce some overhead: 1. An extra 4 bytes of memory consumption for each row due to the offset size difference from 32-bit to 64-bit. 2. A wider offset type requires a few more SIMD instructions in each 8-row processing iteration. In my opinion, this overhead is justified by the benefits listed above. ### What changes are included in this PR? Change the row offset of the row table from 32-bit to 64-bit. Relative code in row comparison/encoding and swiss join has been updated accordingly. ### Are these changes tested? Test included. ### Are there any user-facing changes? Users could potentially see higher memory consumption when using acero's hash join and hash aggregation. However, on the other hand, certain use cases used to fail are now able to complete. * GitHub Issue: #43495 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/src/arrow/acero/hash_join_node_test.cc | 192 ++++++++++ cpp/src/arrow/acero/swiss_join.cc | 26 +- cpp/src/arrow/acero/swiss_join_avx2.cc | 126 +++++-- cpp/src/arrow/compute/row/compare_internal.cc | 39 +- cpp/src/arrow/compute/row/compare_internal.h | 27 +- .../compute/row/compare_internal_avx2.cc | 172 ++++----- cpp/src/arrow/compute/row/compare_test.cc | 333 +++++++++++++----- cpp/src/arrow/compute/row/encode_internal.cc | 47 ++- cpp/src/arrow/compute/row/encode_internal.h | 7 +- .../arrow/compute/row/encode_internal_avx2.cc | 10 +- cpp/src/arrow/compute/row/row_internal.cc | 38 +- cpp/src/arrow/compute/row/row_internal.h | 37 +- cpp/src/arrow/compute/row/row_test.cc | 66 ++-- cpp/src/arrow/testing/random.cc | 19 +- cpp/src/arrow/testing/random.h | 6 + 15 files changed, 802 insertions(+), 343 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index f7b442cc3c6..88f9a9e71b7 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" #include "arrow/testing/extension_type.h" +#include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" @@ -40,6 +41,10 @@ using testing::UnorderedElementsAreArray; namespace arrow { +using arrow::gen::Constant; +using arrow::random::kSeedMax; +using arrow::random::RandomArrayGenerator; +using compute::and_; using compute::call; using compute::default_exec_context; using compute::ExecBatchBuilder; @@ -3253,5 +3258,192 @@ TEST(HashJoin, ManyJoins) { ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); } +namespace { + +void AssertRowCountEq(Declaration source, int64_t expected) { + Declaration count{"aggregate", + {std::move(source)}, + AggregateNodeOptions{/*aggregates=*/{{"count_all", "count(*)"}}}}; + ASSERT_OK_AND_ASSIGN(auto batches, DeclarationToExecBatches(std::move(count))); + ASSERT_EQ(batches.batches.size(), 1); + ASSERT_EQ(batches.batches[0].values.size(), 1); + ASSERT_TRUE(batches.batches[0].values[0].is_scalar()); + ASSERT_EQ(batches.batches[0].values[0].scalar()->type->id(), Type::INT64); + ASSERT_TRUE(batches.batches[0].values[0].scalar_as().is_valid); + ASSERT_EQ(batches.batches[0].values[0].scalar_as().value, expected); +} + +} // namespace + +// GH-43495: Test that both the key and the payload of the right side (the build side) are +// fixed length and larger 
than 4GB, and the 64-bit offset in the hash table can handle it +// correctly. +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { + constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; + constexpr int fixed_length = 128; + const auto type = fixed_size_binary(fixed_length); + constexpr uint8_t byte_no_match_min = static_cast('A'); + constexpr uint8_t byte_no_match_max = static_cast('y'); + constexpr uint8_t byte_match = static_cast('z'); + const auto value_match = + std::make_shared(std::string(fixed_length, byte_match)); + constexpr int16_t num_rows_per_batch_left = 128; + constexpr int16_t num_rows_per_batch_right = 4096; + const int64_t num_batches_left = 8; + const int64_t num_batches_right = + k5GB / (num_rows_per_batch_right * type->byte_width()); + + // Left side composed of num_batches_left identical batches of num_rows_per_batch_left + // rows of value_match-es. + BatchesWithSchema batches_left; + { + // A column with num_rows_per_batch_left value_match-es. + ASSERT_OK_AND_ASSIGN(auto column, + Constant(value_match)->Generate(num_rows_per_batch_left)); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_left); + batches_left = + BatchesWithSchema{std::vector(num_batches_left, std::move(batch)), + schema({field("l_key", type), field("l_payload", type)})}; + } + + // Right side composed of num_batches_right identical batches of + // num_rows_per_batch_right rows containing only 1 value_match. + BatchesWithSchema batches_right; + { + // A column with (num_rows_per_batch_right - 1) non-value_match-es (possibly null) and + // 1 value_match. + auto non_matches = RandomArrayGenerator(kSeedMax).FixedSizeBinary( + num_rows_per_batch_right - 1, fixed_length, + /*null_probability =*/0.01, /*min_byte=*/byte_no_match_min, + /*max_byte=*/byte_no_match_max); + ASSERT_OK_AND_ASSIGN(auto match, Constant(value_match)->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_right); + batches_right = + BatchesWithSchema{std::vector(num_batches_right, std::move(batch)), + schema({field("r_key", type), field("r_payload", type)})}; + } + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_left.schema), + std::move(batches_left.batches))}; + + Declaration right{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_right.schema), + std::move(batches_right.batches))}; + + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + + // The row count of hash join should be (number of value_match-es in left side) * + // (number of value_match-es in right side). + AssertRowCountEq(result, + num_batches_left * num_rows_per_batch_left * num_batches_right); + + // All rows should be value_match-es. 
+ auto predicate = and_({equal(field_ref("l_key"), literal(value_match)), + equal(field_ref("l_payload"), literal(value_match)), + equal(field_ref("r_key"), literal(value_match)), + equal(field_ref("r_payload"), literal(value_match))}); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), + num_batches_left * num_rows_per_batch_left * num_batches_right); +} + +// GH-43495: Test that both the key and the payload of the right side (the build side) are +// var length and larger than 4GB, and the 64-bit offset in the hash table can handle it +// correctly. +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { + constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; + const auto type = utf8(); + constexpr int value_no_match_length_min = 128; + constexpr int value_no_match_length_max = 129; + constexpr int value_match_length = 130; + const auto value_match = + std::make_shared(std::string(value_match_length, 'X')); + constexpr int16_t num_rows_per_batch_left = 128; + constexpr int16_t num_rows_per_batch_right = 4096; + const int64_t num_batches_left = 8; + const int64_t num_batches_right = + k5GB / (num_rows_per_batch_right * value_no_match_length_min); + + // Left side composed of num_batches_left identical batches of num_rows_per_batch_left + // rows of value_match-es. + BatchesWithSchema batches_left; + { + // A column with num_rows_per_batch_left value_match-es. + ASSERT_OK_AND_ASSIGN(auto column, + Constant(value_match)->Generate(num_rows_per_batch_left)); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_left); + batches_left = + BatchesWithSchema{std::vector(num_batches_left, std::move(batch)), + schema({field("l_key", type), field("l_payload", type)})}; + } + + // Right side composed of num_batches_right identical batches of + // num_rows_per_batch_right rows containing only 1 value_match. + BatchesWithSchema batches_right; + { + // A column with (num_rows_per_batch_right - 1) non-value_match-es (possibly null) and + // 1 value_match. + auto non_matches = + RandomArrayGenerator(kSeedMax).String(num_rows_per_batch_right - 1, + /*min_length=*/value_no_match_length_min, + /*max_length=*/value_no_match_length_max, + /*null_probability =*/0.01); + ASSERT_OK_AND_ASSIGN(auto match, Constant(value_match)->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); + + // Use the column as both the key and the payload. 
+ ExecBatch batch({column, column}, num_rows_per_batch_right); + batches_right = + BatchesWithSchema{std::vector(num_batches_right, std::move(batch)), + schema({field("r_key", type), field("r_payload", type)})}; + } + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_left.schema), + std::move(batches_left.batches))}; + + Declaration right{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_right.schema), + std::move(batches_right.batches))}; + + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + + // The row count of hash join should be (number of value_match-es in left side) * + // (number of value_match-es in right side). + AssertRowCountEq(result, + num_batches_left * num_rows_per_batch_left * num_batches_right); + + // All rows should be value_match-es. + auto predicate = and_({equal(field_ref("l_key"), literal(value_match)), + equal(field_ref("l_payload"), literal(value_match)), + equal(field_ref("r_key"), literal(value_match)), + equal(field_ref("r_payload"), literal(value_match))}); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), + num_batches_left * num_rows_per_batch_left * num_batches_right); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 732deb72861..40a4b5886e4 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -122,7 +122,7 @@ void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_ro if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); uint32_t field_offset_within_row, field_length; if (varbinary_column_id == 0) { @@ -173,7 +173,7 @@ void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_ro // Case 4: This is a fixed length column in a varying length row // const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; @@ -473,17 +473,10 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target, (*first_target_row_id)[sources.size()] = num_rows; } - if (num_bytes > std::numeric_limits::max()) { - return Status::Invalid( - "There are more than 2^32 bytes of key data. 
Acero cannot " - "process a join of this magnitude"); - } - // Allocate target memory // target->rows_.Clean(); - RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast(num_rows), - static_cast(num_bytes))); + RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast(num_rows), num_bytes)); // In case of varying length rows, // initialize the first row offset for each range of rows corresponding to a @@ -565,15 +558,15 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& int64_t first_target_row_offset, const int64_t* source_rows_permutation) { int64_t num_source_rows = source.length(); - uint32_t* target_offsets = target->mutable_offsets(); - const uint32_t* source_offsets = source.offsets(); + RowTableImpl::offset_type* target_offsets = target->mutable_offsets(); + const RowTableImpl::offset_type* source_offsets = source.offsets(); // Permutation of source rows is optional. // if (!source_rows_permutation) { int64_t target_row_offset = first_target_row_offset; for (int64_t i = 0; i < num_source_rows; ++i) { - target_offsets[first_target_row_id + i] = static_cast(target_row_offset); + target_offsets[first_target_row_id + i] = target_row_offset; target_row_offset += source_offsets[i + 1] - source_offsets[i]; } // We purposefully skip outputting of N+1 offset, to allow concurrent @@ -593,7 +586,10 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& int64_t source_row_id = source_rows_permutation[i]; const uint64_t* source_row_ptr = reinterpret_cast( source.data(2) + source_offsets[source_row_id]); - uint32_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; + int64_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; + // Though the row offset is 64-bit, the length of a single row must be 32-bit as + // required by current row table implementation. + DCHECK_LE(length, std::numeric_limits::max()); // Rows should be 64-bit aligned. // In that case we can copy them using a sequence of 64-bit read/writes. @@ -604,7 +600,7 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& *target_row_ptr++ = *source_row_ptr++; } - target_offsets[first_target_row_id + i] = static_cast(target_row_offset); + target_offsets[first_target_row_id + i] = target_row_offset; target_row_offset += length; } } diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 0888dd89384..e42b0b40445 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -23,6 +23,9 @@ namespace arrow { namespace acero { +// TODO(GH-43693): The functions in this file are not wired anywhere. We may consider +// actually utilizing them or removing them. 
+ template int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, @@ -45,48 +48,78 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); + static_assert( + sizeof(RowTableImpl::offset_type) == sizeof(int64_t), + "RowArrayAccessor::Visit_avx2 only supports 64-bit RowTableImpl::offset_type"); if (varbinary_column_id == 0) { // Case 1: This is the first varbinary column // __m256i field_offset_within_row = _mm256_set1_epi32(rows.metadata().fixed_length); __m256i varbinary_end_array_offset = - _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset); + _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 + // 64-bit row offsets. + __m128i field_length_lo = _mm256_i64gather_epi32( + reinterpret_cast(row_ptr_base), + _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset), 1); + __m128i field_length_hi = _mm256_i64gather_epi32( + reinterpret_cast(row_ptr_base), + _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset), 1); + // The final 8 32-bit field lengths, subtracting the field offset within row. __m256i field_length = _mm256_sub_epi32( - _mm256_i32gather_epi32( - reinterpret_cast(row_ptr_base), - _mm256_add_epi32(row_offset, varbinary_end_array_offset), 1), - field_offset_within_row); + _mm256_set_m128i(field_length_hi, field_length_lo), field_offset_within_row); process_8_values_fn(i * unroll, row_ptr_base, - _mm256_add_epi32(row_offset, field_offset_within_row), + _mm256_add_epi64(row_offset_lo, field_offset_within_row), + _mm256_add_epi64(row_offset_hi, field_offset_within_row), field_length); } } else { // Case 2: This is second or later varbinary column // __m256i varbinary_end_array_offset = - _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset + - sizeof(uint32_t) * (varbinary_column_id - 1)); + _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset + + sizeof(uint32_t) * (varbinary_column_id - 1)); auto row_ptr_base_i64 = reinterpret_cast(row_ptr_base); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. 
__m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); - __m256i end_array_offset = - _mm256_add_epi32(row_offset, varbinary_end_array_offset); - - __m256i field_offset_within_row_A = _mm256_i32gather_epi64( - row_ptr_base_i64, _mm256_castsi256_si128(end_array_offset), 1); - __m256i field_offset_within_row_B = _mm256_i32gather_epi64( - row_ptr_base_i64, _mm256_extracti128_si256(end_array_offset, 1), 1); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 + // 64-bit row offsets. + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + // Prepare the lower/higher 4 64-bit end array offsets based on the lower/higher 4 + // 64-bit row offsets. + __m256i end_array_offset_lo = + _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset); + __m256i end_array_offset_hi = + _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset); + + __m256i field_offset_within_row_A = + _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_lo, 1); + __m256i field_offset_within_row_B = + _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_hi, 1); field_offset_within_row_A = _mm256_permutevar8x32_epi32( field_offset_within_row_A, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); field_offset_within_row_B = _mm256_permutevar8x32_epi32( @@ -110,8 +143,14 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu 0x4e); // Swapping low and high 128-bits field_length = _mm256_sub_epi32(field_length, field_offset_within_row); + field_offset_within_row_A = + _mm256_add_epi32(field_offset_within_row_A, alignment_padding); + field_offset_within_row_B = + _mm256_add_epi32(field_offset_within_row_B, alignment_padding); + process_8_values_fn(i * unroll, row_ptr_base, - _mm256_add_epi32(row_offset, field_offset_within_row), + _mm256_add_epi64(row_offset_lo, field_offset_within_row_A), + _mm256_add_epi64(row_offset_hi, field_offset_within_row_B), field_length); } } @@ -119,7 +158,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (is_fixed_length_column) { __m256i field_offset_within_row = - _mm256_set1_epi32(rows.metadata().encoded_field_offset( + _mm256_set1_epi64x(rows.metadata().encoded_field_offset( rows.metadata().pos_after_encoding(column_id))); __m256i field_length = _mm256_set1_epi32(rows.metadata().column_metadatas[column_id].fixed_length); @@ -130,24 +169,51 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // const uint8_t* row_ptr_base = rows.data(1); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_mullo_epi32(row_id, field_length); - __m256i field_offset = _mm256_add_epi32(row_offset, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset, field_length); + // Widen the 32-bit row ids to 64-bit and store the lower/higher 4 of them into 2 + // 256-bit registers. 
+ __m256i row_id_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id)); + __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); + // Calculate the lower/higher 4 64-bit row offsets based on the lower/higher 4 + // 64-bit row ids and the fixed field length. + __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, field_length); + __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, field_length); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. + __m256i field_offset_lo = + _mm256_add_epi64(row_offset_lo, field_offset_within_row); + __m256i field_offset_hi = + _mm256_add_epi64(row_offset_hi, field_offset_within_row); + process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + field_length); } } else { // Case 4: This is a fixed length column in varying length row // const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); - __m256i field_offset = _mm256_add_epi32(row_offset, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset, field_length); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. + __m256i field_offset_lo = + _mm256_add_epi64(row_offset_lo, field_offset_within_row); + __m256i field_offset_hi = + _mm256_add_epi64(row_offset_hi, field_offset_within_row); + process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + field_length); } } } diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 98aea901126..5e1a87b7952 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -104,18 +104,21 @@ void KeyCompare::CompareBinaryColumnToRowHelper( const uint8_t* rows_right = rows.data(1); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; - uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t offset_right = irow_right * fixed_length + offset_within_row; + // irow_right is used to index into row data so promote to the row offset type. + RowTableImpl::offset_type irow_right = left_to_right_map[irow_left]; + RowTableImpl::offset_type offset_right = + irow_right * fixed_length + offset_within_row; match_bytevector[i] = compare_fn(rows_left, rows_right, irow_left, offset_right); } } else { const uint8_t* rows_left = col.data(1); - const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_right = rows.data(2); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? 
sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; - uint32_t offset_right = offsets_right[irow_right] + offset_within_row; + RowTableImpl::offset_type offset_right = + offsets_right[irow_right] + offset_within_row; match_bytevector[i] = compare_fn(rows_left, rows_right, irow_left, offset_right); } } @@ -145,7 +148,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [bit_offset](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left, uint32_t offset_right) { + uint32_t irow_left, RowTableImpl::offset_type offset_right) { uint8_t left = bit_util::GetBit(left_base, irow_left + bit_offset) ? 0xff : 0x00; uint8_t right = right_base[offset_right]; @@ -156,7 +159,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { uint8_t left = left_base[irow_left]; uint8_t right = right_base[offset_right]; return left == right ? 0xff : 0; @@ -166,7 +169,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { util::CheckAlignment(left_base); util::CheckAlignment(right_base + offset_right); uint16_t left = reinterpret_cast(left_base)[irow_left]; @@ -178,7 +181,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { util::CheckAlignment(left_base); util::CheckAlignment(right_base + offset_right); uint32_t left = reinterpret_cast(left_base)[irow_left]; @@ -190,7 +193,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { util::CheckAlignment(left_base); util::CheckAlignment(right_base + offset_right); uint64_t left = reinterpret_cast(left_base)[irow_left]; @@ -202,7 +205,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, offset_within_row, num_processed, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [&col](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, - uint32_t offset_right) { + RowTableImpl::offset_type offset_right) { uint32_t length = col.metadata().fixed_length; // Non-zero length guarantees no underflow @@ -241,7 +244,7 @@ void KeyCompare::CompareVarBinaryColumnToRowHelper( const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, uint8_t* match_bytevector) { const uint32_t* offsets_left = col.offsets(); 
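Throughout these hunks the right-hand row offsets move from `uint32_t` to the 64-bit `RowTableImpl::offset_type`. A minimal sketch of why the wider type matters (the per-row length below is hypothetical, chosen so the running total passes 4 GB quickly): accumulating row lengths in `uint32_t` silently wraps, while a 64-bit accumulator stays exact.

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int64_t kRowLength = 512LL * 1024 * 1024;  // 512 MiB per row (illustrative)
  uint32_t offset32 = 0;
  int64_t offset64 = 0;
  for (int row = 0; row < 9; ++row) {                  // 9 rows ~= 4.5 GiB of row data
    offset32 += static_cast<uint32_t>(kRowLength);     // wraps modulo 2^32
    offset64 += kRowLength;
  }
  std::printf("32-bit offset: %u\n64-bit offset: %lld\n", offset32,
              static_cast<long long>(offset64));
  // Prints 536870912 (wrapped) vs 4831838208 (exact).
  return 0;
}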
- const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_left = col.data(2); const uint8_t* rows_right = rows.data(2); for (uint32_t i = first_row_to_compare; i < num_rows_to_compare; ++i) { @@ -249,7 +252,7 @@ void KeyCompare::CompareVarBinaryColumnToRowHelper( uint32_t irow_right = left_to_right_map[irow_left]; uint32_t begin_left = offsets_left[irow_left]; uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t begin_right = offsets_right[irow_right]; + RowTableImpl::offset_type begin_right = offsets_right[irow_right]; uint32_t length_right; uint32_t offset_within_row; if (!is_first_varbinary_col) { @@ -334,7 +337,13 @@ void KeyCompare::CompareColumnsToRows( const RowTableImpl& rows, bool are_cols_in_encoding_order, uint8_t* out_match_bitvector_maybe_null) { if (num_rows_to_compare == 0) { - *out_num_rows = 0; + if (out_match_bitvector_maybe_null) { + DCHECK_EQ(out_num_rows, nullptr); + DCHECK_EQ(out_sel_left_maybe_same, nullptr); + bit_util::ClearBitmap(out_match_bitvector_maybe_null, 0, num_rows_to_compare); + } else { + *out_num_rows = 0; + } return; } @@ -440,8 +449,8 @@ void KeyCompare::CompareColumnsToRows( match_bytevector_A, match_bitvector); if (out_match_bitvector_maybe_null) { - ARROW_DCHECK(out_num_rows == nullptr); - ARROW_DCHECK(out_sel_left_maybe_same == nullptr); + DCHECK_EQ(out_num_rows, nullptr); + DCHECK_EQ(out_sel_left_maybe_same, nullptr); memcpy(out_match_bitvector_maybe_null, match_bitvector, bit_util::BytesForBits(num_rows_to_compare)); } else { diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index a5a109b0b51..29d7f859e59 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -42,9 +42,30 @@ class ARROW_EXPORT KeyCompare { /*extra=*/util::MiniBatch::kMiniBatchLength; } - // Returns a single 16-bit selection vector of rows that failed comparison. - // If there is input selection on the left, the resulting selection is a filtered image - // of input selection. + /// \brief Compare a batch of rows in columnar format to the specified rows in row + /// format. + /// + /// The comparison result is populated in either a 16-bit selection vector of rows that + /// failed comparison, or a match bitvector with 1 for matched rows and 0 otherwise. + /// + /// @param num_rows_to_compare The number of rows to compare. + /// @param sel_left_maybe_null Optional input selection vector on the left, the + /// comparison is only performed on the selected rows. Null if all rows in + /// `left_to_right_map` are to be compared. + /// @param left_to_right_map The mapping from the left to the right rows. Left row `i` + /// in `cols` is compared to right row `left_to_right_map[i]` in `row`. + /// @param ctx The light context needed for the comparison. + /// @param out_num_rows The number of rows that failed comparison. Must be null if + /// `out_match_bitvector_maybe_null` is not null. + /// @param out_sel_left_maybe_same The selection vector of rows that failed comparison. + /// Can be the same as `sel_left_maybe_null` for in-place update. Must be null if + /// `out_match_bitvector_maybe_null` is not null. + /// @param cols The left rows in columnar format to compare. + /// @param rows The right rows in row format to compare. + /// @param are_cols_in_encoding_order Whether the columns are in encoding order. 
+ /// @param out_match_bitvector_maybe_null The optional output match bitvector, 1 for + /// matched rows and 0 otherwise. Won't be populated if `out_num_rows` and + /// `out_sel_left_maybe_same` are not null. static void CompareColumnsToRows( uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, uint32_t* out_num_rows, diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 23238a3691c..96eed6fc03a 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -180,40 +180,6 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( } } -namespace { - -// Intrinsics `_mm256_i32gather_epi32/64` treat the `vindex` as signed integer, and we -// are using `uint32_t` to represent the offset, in range of [0, 4G), within the row -// table. When the offset is larger than `0x80000000` (2GB), those intrinsics will treat -// it as negative offset and gather the data from undesired address. To avoid this issue, -// we normalize the addresses by translating `base` `0x80000000` higher, and `offset` -// `0x80000000` lower. This way, the offset is always in range of [-2G, 2G) and those -// intrinsics are safe. - -constexpr uint64_t kTwoGB = 0x80000000ull; - -template -inline __m256i UnsignedOffsetSafeGather32(int const* base, __m256i offset) { - int const* normalized_base = base + kTwoGB / sizeof(int); - __m256i normalized_offset = - _mm256_sub_epi32(offset, _mm256_set1_epi32(static_cast(kTwoGB / kScale))); - return _mm256_i32gather_epi32(normalized_base, normalized_offset, - static_cast(kScale)); -} - -template -inline __m256i UnsignedOffsetSafeGather64(arrow::util::int64_for_gather_t const* base, - __m128i offset) { - arrow::util::int64_for_gather_t const* normalized_base = - base + kTwoGB / sizeof(arrow::util::int64_for_gather_t); - __m128i normalized_offset = - _mm_sub_epi32(offset, _mm_set1_epi32(static_cast(kTwoGB / kScale))); - return _mm256_i32gather_epi64(normalized_base, normalized_offset, - static_cast(kScale)); -} - -} // namespace - template uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( uint32_t offset_within_row, uint32_t num_rows_to_compare, @@ -240,12 +206,26 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - __m256i offset_right = - _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(fixed_length)); - offset_right = _mm256_add_epi32(offset_right, _mm256_set1_epi32(offset_within_row)); - - reinterpret_cast(match_bytevector)[i] = - compare8_fn(rows_left, rows_right, i * unroll, irow_left, offset_right); + // Widen the 32-bit row ids to 64-bit and store the first/last 4 of them into 2 + // 256-bit registers. + __m256i irow_right_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right)); + __m256i irow_right_hi = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1)); + // Calculate the lower/higher 4 64-bit row offsets based on the lower/higher 4 + // 64-bit row ids and the fixed length. + __m256i offset_right_lo = + _mm256_mul_epi32(irow_right_lo, _mm256_set1_epi64x(fixed_length)); + __m256i offset_right_hi = + _mm256_mul_epi32(irow_right_hi, _mm256_set1_epi64x(fixed_length)); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. 
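The multiply just above leans on a detail of `_mm256_mul_epi32`: it reads only the low 32 bits of each 64-bit lane and produces a full 64-bit product, so a 32-bit row id times a 32-bit fixed length can exceed 4 GB without truncation. A small standalone sketch of that behavior (the ids and the fixed length are illustrative values, not taken from the patch):

// Build with: g++ -O2 -mavx2 mul_sketch.cc
#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  int32_t row_ids[4] = {1, 70000, 1000000, 2000000};
  const int64_t fixed_length = 65536;  // hypothetical fixed row width in bytes

  // Widen the 4 ids to 64-bit lanes, then multiply the low 32 bits of each lane.
  __m128i ids32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(row_ids));
  __m256i ids64 = _mm256_cvtepi32_epi64(ids32);
  __m256i offsets = _mm256_mul_epi32(ids64, _mm256_set1_epi64x(fixed_length));

  int64_t got[4];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(got), offsets);
  for (int i = 0; i < 4; ++i) {
    assert(got[i] == static_cast<int64_t>(row_ids[i]) * fixed_length);
  }
  // e.g. 2000000 * 65536 = 131072000000, well past the 32-bit range.
  return 0;
}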
+ offset_right_lo = + _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); + offset_right_hi = + _mm256_add_epi64(offset_right_hi, _mm256_set1_epi64x(offset_within_row)); + + reinterpret_cast(match_bytevector)[i] = compare8_fn( + rows_left, rows_right, i * unroll, irow_left, offset_right_lo, offset_right_hi); if (!use_selection) { irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(8)); @@ -254,7 +234,7 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( return num_rows_to_compare - (num_rows_to_compare % unroll); } else { const uint8_t* rows_left = col.data(1); - const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_right = rows.data(2); constexpr uint32_t unroll = 8; __m256i irow_left = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); @@ -270,12 +250,29 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( irow_right = _mm256_loadu_si256(reinterpret_cast(left_to_right_map) + i); } - __m256i offset_right = - UnsignedOffsetSafeGather32<4>((int const*)offsets_right, irow_right); - offset_right = _mm256_add_epi32(offset_right, _mm256_set1_epi32(offset_within_row)); - reinterpret_cast(match_bytevector)[i] = - compare8_fn(rows_left, rows_right, i * unroll, irow_left, offset_right); + static_assert(sizeof(RowTableImpl::offset_type) == sizeof(int64_t), + "KeyCompare::CompareBinaryColumnToRowHelper_avx2 only supports " + "64-bit RowTableImpl::offset_type"); + auto offsets_right_i64 = + reinterpret_cast(offsets_right); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. + __m256i offset_right_lo = + _mm256_i32gather_epi64(offsets_right_i64, _mm256_castsi256_si128(irow_right), + sizeof(RowTableImpl::offset_type)); + __m256i offset_right_hi = _mm256_i32gather_epi64( + offsets_right_i64, _mm256_extracti128_si256(irow_right, 1), + sizeof(RowTableImpl::offset_type)); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. + offset_right_lo = + _mm256_add_epi64(offset_right_lo, _mm256_set1_epi64x(offset_within_row)); + offset_right_hi = + _mm256_add_epi64(offset_right_hi, _mm256_set1_epi64x(offset_within_row)); + + reinterpret_cast(match_bytevector)[i] = compare8_fn( + rows_left, rows_right, i * unroll, irow_left, offset_right_lo, offset_right_hi); if (!use_selection) { irow_left = _mm256_add_epi32(irow_left, _mm256_set1_epi32(8)); @@ -287,8 +284,8 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2( template inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* right_base, - __m256i irow_left, __m256i offset_right, - int bit_offset = 0) { + __m256i irow_left, __m256i offset_right_lo, + __m256i offset_right_hi, int bit_offset = 0) { __m256i left; switch (column_width) { case 0: { @@ -315,7 +312,9 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r ARROW_DCHECK(false); } - __m256i right = UnsignedOffsetSafeGather32<1>((int const*)right_base, offset_right); + __m128i right_lo = _mm256_i64gather_epi32((int const*)right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32((int const*)right_base, offset_right_hi, 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 
0xff : 0xffff; right = _mm256_and_si256(right, _mm256_set1_epi32(mask)); @@ -333,8 +332,8 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r template inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_first, __m256i offset_right, - int bit_offset = 0) { + uint32_t irow_left_first, __m256i offset_right_lo, + __m256i offset_right_hi, int bit_offset = 0) { __m256i left; switch (column_width) { case 0: { @@ -364,7 +363,9 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas ARROW_DCHECK(false); } - __m256i right = UnsignedOffsetSafeGather32<1>((int const*)right_base, offset_right); + __m128i right_lo = _mm256_i64gather_epi32((int const*)right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32((int const*)right_base, offset_right_hi, 1); + __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; right = _mm256_and_si256(right, _mm256_set1_epi32(mask)); @@ -383,7 +384,7 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas template inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* right_base, __m256i irow_left, uint32_t irow_left_first, - __m256i offset_right) { + __m256i offset_right_lo, __m256i offset_right_hi) { auto left_base_i64 = reinterpret_cast(left_base); __m256i left_lo, left_hi; @@ -400,10 +401,8 @@ inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* rig } auto right_base_i64 = reinterpret_cast(right_base); - __m256i right_lo = - UnsignedOffsetSafeGather64<1>(right_base_i64, _mm256_castsi256_si128(offset_right)); - __m256i right_hi = UnsignedOffsetSafeGather64<1>( - right_base_i64, _mm256_extracti128_si256(offset_right, 1)); + __m256i right_lo = _mm256_i64gather_epi64(right_base_i64, offset_right_lo, 1); + __m256i right_hi = _mm256_i64gather_epi64(right_base_i64, offset_right_hi, 1); uint32_t result_lo = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_lo, right_lo)); uint32_t result_hi = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_hi, right_hi)); return result_lo | (static_cast(result_hi) << 32); @@ -412,13 +411,19 @@ inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* rig template inline uint64_t Compare8_Binary_avx2(uint32_t length, const uint8_t* left_base, const uint8_t* right_base, __m256i irow_left, - uint32_t irow_left_first, __m256i offset_right) { + uint32_t irow_left_first, __m256i offset_right_lo, + __m256i offset_right_hi) { uint32_t irow_left_array[8]; - uint32_t offset_right_array[8]; + RowTableImpl::offset_type offset_right_array[8]; if (use_selection) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(irow_left_array), irow_left); } - _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right); + static_assert( + sizeof(RowTableImpl::offset_type) * 4 == sizeof(__m256i), + "Unexpected RowTableImpl::offset_type size in KeyCompare::Compare8_Binary_avx2"); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(offset_right_array), offset_right_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&offset_right_array[4]), + offset_right_hi); // Non-zero length guarantees no underflow int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; @@ -463,13 +468,14 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, 
col, rows, match_bytevector, [bit_offset](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right_lo, + __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<0>(left_base, right_base, irow_left, - offset_right, bit_offset); + offset_right_lo, offset_right_hi, bit_offset); } else { - return Compare8_avx2<0>(left_base, right_base, irow_left_base, offset_right, - bit_offset); + return Compare8_avx2<0>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi, bit_offset); } }); } else if (col_width == 1) { @@ -477,12 +483,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<1>(left_base, right_base, irow_left, - offset_right); + offset_right_lo, offset_right_hi); } else { - return Compare8_avx2<1>(left_base, right_base, irow_left_base, offset_right); + return Compare8_avx2<1>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi); } }); } else if (col_width == 2) { @@ -490,12 +497,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<2>(left_base, right_base, irow_left, - offset_right); + offset_right_lo, offset_right_hi); } else { - return Compare8_avx2<2>(left_base, right_base, irow_left_base, offset_right); + return Compare8_avx2<2>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi); } }); } else if (col_width == 4) { @@ -503,12 +511,13 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { if (use_selection) { return CompareSelected8_avx2<4>(left_base, right_base, irow_left, - offset_right); + offset_right_lo, offset_right_hi); } else { - return Compare8_avx2<4>(left_base, right_base, irow_left_base, offset_right); + return Compare8_avx2<4>(left_base, right_base, irow_left_base, + offset_right_lo, offset_right_hi); } }); } else if (col_width == 8) { @@ -516,19 +525,22 @@ uint32_t KeyCompare::CompareBinaryColumnToRowImp_avx2( offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left_base, - __m256i irow_left, __m256i offset_right) { + __m256i irow_left, __m256i offset_right_lo, __m256i offset_right_hi) { return Compare8_64bit_avx2(left_base, right_base, irow_left, - irow_left_base, offset_right); + irow_left_base, offset_right_lo, + offset_right_hi); }); } else { return CompareBinaryColumnToRowHelper_avx2( 
offset_within_row, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, rows, match_bytevector, [&col](const uint8_t* left_base, const uint8_t* right_base, - uint32_t irow_left_base, __m256i irow_left, __m256i offset_right) { + uint32_t irow_left_base, __m256i irow_left, __m256i offset_right_lo, + __m256i offset_right_hi) { uint32_t length = col.metadata().fixed_length; - return Compare8_Binary_avx2( - length, left_base, right_base, irow_left, irow_left_base, offset_right); + return Compare8_Binary_avx2(length, left_base, right_base, + irow_left, irow_left_base, + offset_right_lo, offset_right_hi); }); } } @@ -541,7 +553,7 @@ void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, uint8_t* match_bytevector) { const uint32_t* offsets_left = col.offsets(); - const uint32_t* offsets_right = rows.offsets(); + const RowTableImpl::offset_type* offsets_right = rows.offsets(); const uint8_t* rows_left = col.data(2); const uint8_t* rows_right = rows.data(2); for (uint32_t i = 0; i < num_rows_to_compare; ++i) { @@ -549,7 +561,7 @@ void KeyCompare::CompareVarBinaryColumnToRowImp_avx2( uint32_t irow_right = left_to_right_map[irow_left]; uint32_t begin_left = offsets_left[irow_left]; uint32_t length_left = offsets_left[irow_left + 1] - begin_left; - uint32_t begin_right = offsets_right[irow_right]; + RowTableImpl::offset_type begin_right = offsets_right[irow_right]; uint32_t length_right; uint32_t offset_within_row; if (!is_first_varbinary_col) { diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 22af7e067d8..5e8ee7c58a7 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -27,7 +27,12 @@ namespace arrow { namespace compute { using arrow::bit_util::BytesForBits; +using arrow::bit_util::GetBit; +using arrow::gen::Constant; +using arrow::gen::Random; +using arrow::internal::CountSetBits; using arrow::internal::CpuInfo; +using arrow::random::kSeedMax; using arrow::random::RandomArrayGenerator; using arrow::util::MiniBatch; using arrow::util::TempVectorStack; @@ -106,7 +111,7 @@ TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { true, match_bitvector.data()); for (int i = 0; i < num_rows; ++i) { SCOPED_TRACE(i); - ASSERT_EQ(arrow::bit_util::GetBit(match_bitvector.data(), i), i != 6); + ASSERT_EQ(GetBit(match_bitvector.data(), i), i != 6); } } } @@ -166,9 +171,111 @@ TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) { } } +namespace { + +Result MakeRowTableFromExecBatch(const ExecBatch& batch) { + RowTableImpl row_table; + + std::vector column_metadatas; + RETURN_NOT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas)); + RowTableMetadata table_metadata; + table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), + sizeof(uint64_t)); + RETURN_NOT_OK(row_table.Init(default_memory_pool(), table_metadata)); + std::vector row_ids(batch.length); + std::iota(row_ids.begin(), row_ids.end(), 0); + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); + std::vector column_arrays; + RETURN_NOT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); + row_encoder.PrepareEncodeSelected(0, batch.length, column_arrays); + RETURN_NOT_OK(row_encoder.EncodeSelected( + &row_table, static_cast(batch.length), row_ids.data())); + + return row_table; +} + +Result RepeatRowTableUntil(const RowTableImpl& seed, int64_t num_rows) { + RowTableImpl row_table; + + 
RETURN_NOT_OK(row_table.Init(default_memory_pool(), seed.metadata())); + // Append the seed row table repeatedly to grow the row table to big enough. + while (row_table.length() < num_rows) { + RETURN_NOT_OK(row_table.AppendSelectionFrom(seed, + static_cast(seed.length()), + /*source_row_ids=*/NULLPTR)); + } + + return row_table; +} + +void AssertCompareColumnsToRowsAllMatch(const std::vector& columns, + const RowTableImpl& row_table, + const std::vector& row_ids_to_compare) { + uint32_t num_rows_to_compare = static_cast(row_ids_to_compare.size()); + + TempVectorStack stack; + ASSERT_OK( + stack.Init(default_memory_pool(), + KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_to_compare))); + LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + + { + // No selection, output no match row ids. + uint32_t num_rows_no_match; + std::vector row_ids_out(num_rows_to_compare); + KeyCompare::CompareColumnsToRows(num_rows_to_compare, /*sel_left_maybe_null=*/NULLPTR, + row_ids_to_compare.data(), &ctx, &num_rows_no_match, + row_ids_out.data(), columns, row_table, + /*are_cols_in_encoding_order=*/true, + /*out_match_bitvector_maybe_null=*/NULLPTR); + ASSERT_EQ(num_rows_no_match, 0); + } + + { + // No selection, output match bit vector. + std::vector match_bitvector(BytesForBits(num_rows_to_compare)); + KeyCompare::CompareColumnsToRows( + num_rows_to_compare, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), + &ctx, + /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns, row_table, + /*are_cols_in_encoding_order=*/true, match_bitvector.data()); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_to_compare), + num_rows_to_compare); + } + + std::vector selection_left(num_rows_to_compare); + std::iota(selection_left.begin(), selection_left.end(), 0); + + { + // With selection, output no match row ids. + uint32_t num_rows_no_match; + std::vector row_ids_out(num_rows_to_compare); + KeyCompare::CompareColumnsToRows(num_rows_to_compare, selection_left.data(), + row_ids_to_compare.data(), &ctx, &num_rows_no_match, + row_ids_out.data(), columns, row_table, + /*are_cols_in_encoding_order=*/true, + /*out_match_bitvector_maybe_null=*/NULLPTR); + ASSERT_EQ(num_rows_no_match, 0); + } + + { + // With selection, output match bit vector. + std::vector match_bitvector(BytesForBits(num_rows_to_compare)); + KeyCompare::CompareColumnsToRows( + num_rows_to_compare, selection_left.data(), row_ids_to_compare.data(), &ctx, + /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns, row_table, + /*are_cols_in_encoding_order=*/true, match_bitvector.data()); + ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_to_compare), + num_rows_to_compare); + } +} + +} // namespace + // Compare columns to rows at offsets over 2GB within a row table. // Certain AVX2 instructions may behave unexpectedly causing troubles like GH-41813. -TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { +TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } @@ -176,128 +283,194 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) { // The idea of this case is to create a row table using several fixed length columns and // one var length column (so the row is hence var length and has offset buffer), with // the overall data size exceeding 2GB. Then compare each row with itself. 
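Concretely, the constants chosen below work out as follows. This compile-time sketch mirrors the test's sizing but is not part of the patch; it ignores row-alignment and null-mask overhead, which only make the encoded data larger.

#include <cstdint>

constexpr int64_t kTwoGB = 2LL * 1024 * 1024 * 1024;
constexpr int64_t kNumRows = 65536;                    // uint16 max + 1
constexpr int64_t kVarLength = kTwoGB / kNumRows - 1;  // 32767 bytes per row
constexpr int64_t kFixedWidth = 8 + 4;                 // uint64 + uint32 columns
constexpr int64_t kRowSize = kVarLength + kFixedWidth;

static_assert(kRowSize * kNumRows > kTwoGB,
              "the encoded row data must exceed 2GB for the test to be meaningful");

int main() { return 0; }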
- constexpr int64_t two_gb = 2ll * 1024ll * 1024ll * 1024ll; + constexpr int64_t k2GB = 2ll * 1024ll * 1024ll * 1024ll; // The compare function requires the row id of the left column to be uint16_t, hence the // number of rows. constexpr int64_t num_rows = std::numeric_limits::max() + 1; const std::vector> fixed_length_types{uint64(), uint32()}; // The var length column should be a little smaller than 2GB to workaround the capacity // limitation in the var length builder. - constexpr int32_t var_length = two_gb / num_rows - 1; + constexpr int32_t var_length = k2GB / num_rows - 1; auto row_size = std::accumulate(fixed_length_types.begin(), fixed_length_types.end(), static_cast(var_length), [](int64_t acc, const std::shared_ptr& type) { return acc + type->byte_width(); }); // The overall size should be larger than 2GB. - ASSERT_GT(row_size * num_rows, two_gb); - - MemoryPool* pool = default_memory_pool(); + ASSERT_GT(row_size * num_rows, k2GB); - // The left side columns. - std::vector columns_left; + // The left side batch. ExecBatch batch_left; { std::vector values; // Several fixed length arrays containing random content. for (const auto& type : fixed_length_types) { - ASSERT_OK_AND_ASSIGN(auto value, ::arrow::gen::Random(type)->Generate(num_rows)); + ASSERT_OK_AND_ASSIGN(auto value, Random(type)->Generate(num_rows)); values.push_back(std::move(value)); } // A var length array containing 'X' repeated var_length times. - ASSERT_OK_AND_ASSIGN(auto value_var_length, - ::arrow::gen::Constant( - std::make_shared(std::string(var_length, 'X'))) - ->Generate(num_rows)); + ASSERT_OK_AND_ASSIGN( + auto value_var_length, + Constant(std::make_shared(std::string(var_length, 'X'))) + ->Generate(num_rows)); values.push_back(std::move(value_var_length)); batch_left = ExecBatch(std::move(values), num_rows); - ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); } + // The left side columns. + std::vector columns_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); + // The right side row table. - RowTableImpl row_table_right; - { - // Encode the row table with the left columns. - std::vector column_metadatas; - ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas)); - RowTableMetadata table_metadata; - table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), - sizeof(uint64_t)); - ASSERT_OK(row_table_right.Init(pool, table_metadata)); - std::vector row_ids(num_rows); - std::iota(row_ids.begin(), row_ids.end(), 0); - RowTableEncoder row_encoder; - row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); - row_encoder.PrepareEncodeSelected(0, num_rows, columns_left); - ASSERT_OK(row_encoder.EncodeSelected( - &row_table_right, static_cast(num_rows), row_ids.data())); - - // The row table must contain an offset buffer. - ASSERT_NE(row_table_right.offsets(), NULLPTR); - // The whole point of this test. - ASSERT_GT(row_table_right.offsets()[num_rows - 1], two_gb); - } + ASSERT_OK_AND_ASSIGN(RowTableImpl row_table_right, + MakeRowTableFromExecBatch(batch_left)); + // The row table must contain an offset buffer. + ASSERT_NE(row_table_right.data(2), NULLPTR); + // The whole point of this test. + ASSERT_GT(row_table_right.offsets()[num_rows - 1], k2GB); // The rows to compare. 
std::vector row_ids_to_compare(num_rows); std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), 0); - TempVectorStack stack; - ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); - LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); +} - { - // No selection, output no match row ids. - uint32_t num_rows_no_match; - std::vector row_ids_out(num_rows); - KeyCompare::CompareColumnsToRows(num_rows, /*sel_left_maybe_null=*/NULLPTR, - row_ids_to_compare.data(), &ctx, &num_rows_no_match, - row_ids_out.data(), columns_left, row_table_right, - /*are_cols_in_encoding_order=*/true, - /*out_match_bitvector_maybe_null=*/NULLPTR); - ASSERT_EQ(num_rows_no_match, 0); +// GH-43495: Compare fixed length columns to rows over 4GB within a row table. +TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { + if constexpr (sizeof(void*) == 4) { + GTEST_SKIP() << "Test only works on 64-bit platforms"; } + // The idea of this case is to create a row table using one fixed length column (so the + // row is hence fixed length), with more than 4GB data. Then compare the rows located at + // over 4GB. + + // A small batch to append to the row table repeatedly to grow the row table to big + // enough. + constexpr int64_t num_rows_batch = std::numeric_limits::max(); + constexpr int fixed_length = 256; + + // The size of the row table is one batch larger than 4GB, and we'll compare the last + // num_rows_batch rows. + constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024; + constexpr int64_t num_rows_row_table = + (k4GB / (fixed_length * num_rows_batch) + 1) * num_rows_batch; + static_assert(num_rows_row_table < std::numeric_limits::max(), + "row table length must be less than uint32 max"); + static_assert(num_rows_row_table * fixed_length > k4GB, + "row table size must be greater than 4GB"); + + // The left side batch with num_rows_batch rows. + ExecBatch batch_left; { - // No selection, output match bit vector. - std::vector match_bitvector(BytesForBits(num_rows)); - KeyCompare::CompareColumnsToRows( - num_rows, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), &ctx, - /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, - row_table_right, - /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows), - num_rows); + std::vector values; + + // A fixed length array containing random values. + ASSERT_OK_AND_ASSIGN( + auto value_fixed_length, + Random(fixed_size_binary(fixed_length))->Generate(num_rows_batch)); + values.push_back(std::move(value_fixed_length)); + + batch_left = ExecBatch(std::move(values), num_rows_batch); } - std::vector selection_left(num_rows); - std::iota(selection_left.begin(), selection_left.end(), 0); + // The left side columns with num_rows_batch rows. + std::vector columns_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); + + // The right side row table with num_rows_row_table rows. + ASSERT_OK_AND_ASSIGN( + RowTableImpl row_table_right, + RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), + num_rows_row_table)); + // The row table must not contain a third buffer. + ASSERT_EQ(row_table_right.data(2), NULLPTR); + // The row data must be greater than 4GB. 
+ ASSERT_GT(row_table_right.buffer_size(1), k4GB); + + // The rows to compare: the last num_rows_batch rows in the row table VS. the whole + // batch. + std::vector row_ids_to_compare(num_rows_batch); + std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), + static_cast(num_rows_row_table - num_rows_batch)); + + AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); +} - { - // With selection, output no match row ids. - uint32_t num_rows_no_match; - std::vector row_ids_out(num_rows); - KeyCompare::CompareColumnsToRows(num_rows, selection_left.data(), - row_ids_to_compare.data(), &ctx, &num_rows_no_match, - row_ids_out.data(), columns_left, row_table_right, - /*are_cols_in_encoding_order=*/true, - /*out_match_bitvector_maybe_null=*/NULLPTR); - ASSERT_EQ(num_rows_no_match, 0); +// GH-43495: Compare var length columns to rows at offset over 4GB within a row table. +TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBVarLength)) { + if constexpr (sizeof(void*) == 4) { + GTEST_SKIP() << "Test only works on 64-bit platforms"; } + // The idea of this case is to create a row table using one fixed length column and one + // var length column (so the row is hence var length and has offset buffer), with more + // than 4GB data. Then compare the rows located at over 4GB. + + // A small batch to append to the row table repeatedly to grow the row table to big + // enough. + constexpr int64_t num_rows_batch = std::numeric_limits::max(); + constexpr int fixed_length = 128; + // Involve some small randomness in the var length column. + constexpr int var_length_min = 128; + constexpr int var_length_max = 129; + constexpr double null_probability = 0.01; + + // The size of the row table is one batch larger than 4GB, and we'll compare the last + // num_rows_batch rows. + constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024; + constexpr int64_t size_row_min = fixed_length + var_length_min; + constexpr int64_t num_rows_row_table = + (k4GB / (size_row_min * num_rows_batch) + 1) * num_rows_batch; + static_assert(num_rows_row_table < std::numeric_limits::max(), + "row table length must be less than uint32 max"); + static_assert(num_rows_row_table * size_row_min > k4GB, + "row table size must be greater than 4GB"); + + // The left side batch with num_rows_batch rows. + ExecBatch batch_left; { - // With selection, output match bit vector. - std::vector match_bitvector(BytesForBits(num_rows)); - KeyCompare::CompareColumnsToRows( - num_rows, selection_left.data(), row_ids_to_compare.data(), &ctx, - /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left, - row_table_right, - /*are_cols_in_encoding_order=*/true, match_bitvector.data()); - ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows), - num_rows); + std::vector values; + + // A fixed length array containing random values. + ASSERT_OK_AND_ASSIGN( + auto value_fixed_length, + Random(fixed_size_binary(fixed_length))->Generate(num_rows_batch)); + values.push_back(std::move(value_fixed_length)); + + // A var length array containing random binary of 128 or 129 bytes with small portion + // of nulls. + auto value_var_length = RandomArrayGenerator(kSeedMax).String( + num_rows_batch, var_length_min, var_length_max, null_probability); + values.push_back(std::move(value_var_length)); + + batch_left = ExecBatch(std::move(values), num_rows_batch); } + + // The left side columns with num_rows_batch rows. 
+ std::vector columns_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left)); + + // The right side row table with num_rows_row_table rows. + ASSERT_OK_AND_ASSIGN( + RowTableImpl row_table_right, + RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), + num_rows_row_table)); + // The row table must contain an offset buffer. + ASSERT_NE(row_table_right.data(2), NULLPTR); + // At least the last row should be located at over 4GB. + ASSERT_GT(row_table_right.offsets()[num_rows_row_table - 1], k4GB); + + // The rows to compare: the last num_rows_batch rows in the row table VS. the whole + // batch. + std::vector row_ids_to_compare(num_rows_batch); + std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), + static_cast(num_rows_row_table - num_rows_batch)); + + AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, row_ids_to_compare); } } // namespace compute diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 658e0dffcac..127d43021d6 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -17,7 +17,6 @@ #include "arrow/compute/row/encode_internal.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int_util_overflow.h" namespace arrow { namespace compute { @@ -265,7 +264,8 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, num_rows * row_size); } else if (rows.metadata().is_fixed_length) { uint32_t row_size = rows.metadata().fixed_length; - const uint8_t* row_base = rows.data(1) + start_row * row_size; + const uint8_t* row_base = + rows.data(1) + static_cast(start_row) * row_size; row_base += offset_within_row; uint8_t* col_base = col_prep.mutable_data(1); switch (col_prep.metadata().fixed_length) { @@ -296,7 +296,7 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, DCHECK(false); } } else { - const uint32_t* row_offsets = rows.offsets() + start_row; + const RowTableImpl::offset_type* row_offsets = rows.offsets() + start_row; const uint8_t* row_base = rows.data(2); row_base += offset_within_row; uint8_t* col_base = col_prep.mutable_data(1); @@ -362,14 +362,14 @@ void EncoderBinary::EncodeSelectedImp(uint32_t offset_within_row, RowTableImpl* } else { const uint8_t* src_base = col.data(1); uint8_t* dst = rows->mutable_data(2) + offset_within_row; - const uint32_t* offsets = rows->offsets(); + const RowTableImpl::offset_type* offsets = rows->offsets(); for (uint32_t i = 0; i < num_selected; ++i) { copy_fn(dst + offsets[i], src_base, selection[i]); } if (col.data(0)) { const uint8_t* non_null_bits = col.data(0); uint8_t* dst = rows->mutable_data(2) + offset_within_row; - const uint32_t* offsets = rows->offsets(); + const RowTableImpl::offset_type* offsets = rows->offsets(); for (uint32_t i = 0; i < num_selected; ++i) { bool is_null = !bit_util::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); if (is_null) { @@ -585,10 +585,12 @@ void EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, uint8_t* dst_B = col2->mutable_data(1); uint32_t fixed_length = rows.metadata().fixed_length; - const uint32_t* offsets; + const RowTableImpl::offset_type* offsets; const uint8_t* src_base; if (is_row_fixed_length) { - src_base = rows.data(1) + fixed_length * start_row + offset_within_row; + src_base = rows.data(1) + + static_cast(start_row) * fixed_length + + offset_within_row; offsets = nullptr; } else { src_base = rows.data(2) + offset_within_row; @@ -640,7 +642,7 @@ 
void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows, // The Nth element is the sum of all the lengths of varbinary columns data in // that row, up to and including Nth varbinary column. - const uint32_t* row_offsets = rows.offsets() + start_row; + const RowTableImpl::offset_type* row_offsets = rows.offsets() + start_row; // Set the base offset for each column for (size_t col = 0; col < varbinary_cols->size(); ++col) { @@ -658,8 +660,8 @@ void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows, // Update the offset of each column uint32_t offset_within_row = rows.metadata().fixed_length; for (size_t col = 0; col < varbinary_cols->size(); ++col) { - offset_within_row += - RowTableMetadata::padding_for_alignment(offset_within_row, string_alignment); + offset_within_row += RowTableMetadata::padding_for_alignment_within_row( + offset_within_row, string_alignment); uint32_t length = varbinary_ends[col] - offset_within_row; offset_within_row = varbinary_ends[col]; uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets(); @@ -676,7 +678,7 @@ Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, return Status::OK(); } - uint32_t* row_offsets = rows->mutable_offsets(); + RowTableImpl::offset_type* row_offsets = rows->mutable_offsets(); for (uint32_t i = 0; i < num_selected; ++i) { row_offsets[i] = rows->metadata().fixed_length; } @@ -688,7 +690,7 @@ Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, for (uint32_t i = 0; i < num_selected; ++i) { uint32_t irow = selection[i]; uint32_t length = col_offsets[irow + 1] - col_offsets[irow]; - row_offsets[i] += RowTableMetadata::padding_for_alignment( + row_offsets[i] += RowTableMetadata::padding_for_alignment_row( row_offsets[i], rows->metadata().string_alignment); row_offsets[i] += length; } @@ -708,20 +710,13 @@ Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, } } - uint32_t sum = 0; + int64_t sum = 0; int row_alignment = rows->metadata().row_alignment; for (uint32_t i = 0; i < num_selected; ++i) { - uint32_t length = row_offsets[i]; - length += RowTableMetadata::padding_for_alignment(length, row_alignment); + RowTableImpl::offset_type length = row_offsets[i]; + length += RowTableMetadata::padding_for_alignment_row(length, row_alignment); row_offsets[i] = sum; - uint32_t sum_maybe_overflow = 0; - if (ARROW_PREDICT_FALSE( - arrow::internal::AddWithOverflow(sum, length, &sum_maybe_overflow))) { - return Status::Invalid( - "Offset overflow detected in EncoderOffsets::GetRowOffsetsSelected for row ", i, - " of length ", length, " bytes, current length in total is ", sum, " bytes"); - } - sum = sum_maybe_overflow; + sum += length; } row_offsets[num_selected] = sum; @@ -732,7 +727,7 @@ template void EncoderOffsets::EncodeSelectedImp(uint32_t ivarbinary, RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection) { - const uint32_t* row_offsets = rows->offsets(); + const RowTableImpl::offset_type* row_offsets = rows->offsets(); uint8_t* row_base = rows->mutable_data(2) + rows->metadata().varbinary_end_array_offset + ivarbinary * sizeof(uint32_t); @@ -753,7 +748,7 @@ void EncoderOffsets::EncodeSelectedImp(uint32_t ivarbinary, RowTableImpl* rows, row[0] = rows->metadata().fixed_length + length; } else { row[0] = row[-1] + - RowTableMetadata::padding_for_alignment( + RowTableMetadata::padding_for_alignment_within_row( row[-1], rows->metadata().string_alignment) + length; } @@ -857,7 +852,7 @@ void EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, 
const RowTableI void EncoderVarBinary::EncodeSelected(uint32_t ivarbinary, RowTableImpl* rows, const KeyColumnArray& cols, uint32_t num_selected, const uint16_t* selection) { - const uint32_t* row_offsets = rows->offsets(); + const RowTableImpl::offset_type* row_offsets = rows->offsets(); uint8_t* row_base = rows->mutable_data(2); const uint32_t* col_offsets = cols.offsets(); const uint8_t* col_base = cols.data(2); diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 0618ddd8e4b..37538fcc4b8 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -173,7 +173,7 @@ class EncoderBinary { copy_fn(dst, src, col_width); } } else { - const uint32_t* row_offsets = rows_const->offsets(); + const RowTableImpl::offset_type* row_offsets = rows_const->offsets(); for (uint32_t i = 0; i < num_rows; ++i) { const uint8_t* src; uint8_t* dst; @@ -267,7 +267,8 @@ class EncoderVarBinary { ARROW_DCHECK(!rows_const->metadata().is_fixed_length && !col_const->metadata().is_fixed_length); - const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row; + const RowTableImpl::offset_type* row_offsets_for_batch = + rows_const->offsets() + start_row; const uint32_t* col_offsets = col_const->offsets(); uint32_t col_offset_next = col_offsets[0]; @@ -275,7 +276,7 @@ class EncoderVarBinary { uint32_t col_offset = col_offset_next; col_offset_next = col_offsets[i + 1]; - uint32_t row_offset = row_offsets_for_batch[i]; + RowTableImpl::offset_type row_offset = row_offsets_for_batch[i]; const uint8_t* row = rows_const->data(2) + row_offset; uint32_t offset_within_row; diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc index 50969c7bd60..26f8e3a63de 100644 --- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc @@ -75,10 +75,12 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows uint8_t* col_vals_B = col2->mutable_data(1); uint32_t fixed_length = rows.metadata().fixed_length; - const uint32_t* offsets; + const RowTableImpl::offset_type* offsets; const uint8_t* src_base; if (is_row_fixed_length) { - src_base = rows.data(1) + fixed_length * start_row + offset_within_row; + src_base = rows.data(1) + + static_cast(fixed_length) * start_row + + offset_within_row; offsets = nullptr; } else { src_base = rows.data(2) + offset_within_row; @@ -99,7 +101,7 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows src2 = reinterpret_cast(src + fixed_length * 2); src3 = reinterpret_cast(src + fixed_length * 3); } else { - const uint32_t* row_offsets = offsets + i * unroll; + const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; const uint8_t* src = src_base; src0 = reinterpret_cast(src + row_offsets[0]); src1 = reinterpret_cast(src + row_offsets[1]); @@ -140,7 +142,7 @@ uint32_t EncoderBinaryPair::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows } } } else { - const uint32_t* row_offsets = offsets + i * unroll; + const RowTableImpl::offset_type* row_offsets = offsets + i * unroll; const uint8_t* src = src_base; for (int j = 0; j < unroll; ++j) { if (col_width == 1) { diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 746ed950ffa..aa7e62add45 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -18,7 +18,6 @@ #include 
"arrow/compute/row/row_internal.h" #include "arrow/compute/util.h" -#include "arrow/util/int_util_overflow.h" namespace arrow { namespace compute { @@ -128,8 +127,8 @@ void RowTableMetadata::FromColumnMetadataVector( const KeyColumnMetadata& col = cols[column_order[i]]; if (col.is_fixed_length && col.fixed_length != 0 && ARROW_POPCOUNT64(col.fixed_length) != 1) { - offset_within_row += RowTableMetadata::padding_for_alignment(offset_within_row, - string_alignment, col); + offset_within_row += RowTableMetadata::padding_for_alignment_within_row( + offset_within_row, string_alignment, col); } column_offsets[i] = offset_within_row; if (!col.is_fixed_length) { @@ -155,7 +154,7 @@ void RowTableMetadata::FromColumnMetadataVector( is_fixed_length = (num_varbinary_cols == 0); fixed_length = offset_within_row + - RowTableMetadata::padding_for_alignment( + RowTableMetadata::padding_for_alignment_within_row( offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment); // We set the number of bytes per row storing null masks of individual key columns @@ -191,7 +190,7 @@ Status RowTableImpl::Init(MemoryPool* pool, const RowTableMetadata& metadata) { auto offsets, AllocateResizableBuffer(size_offsets(kInitialRowsCapacity), pool_)); offsets_ = std::move(offsets); memset(offsets_->mutable_data(), 0, size_offsets(kInitialRowsCapacity)); - reinterpret_cast(offsets_->mutable_data())[0] = 0; + reinterpret_cast(offsets_->mutable_data())[0] = 0; ARROW_ASSIGN_OR_RAISE( auto rows, @@ -226,7 +225,7 @@ void RowTableImpl::Clean() { has_any_nulls_ = false; if (!metadata_.is_fixed_length) { - reinterpret_cast(offsets_->mutable_data())[0] = 0; + reinterpret_cast(offsets_->mutable_data())[0] = 0; } } @@ -235,7 +234,7 @@ int64_t RowTableImpl::size_null_masks(int64_t num_rows) const { } int64_t RowTableImpl::size_offsets(int64_t num_rows) const { - return (num_rows + 1) * sizeof(uint32_t) + kPaddingForVectors; + return (num_rows + 1) * sizeof(offset_type) + kPaddingForVectors; } int64_t RowTableImpl::size_rows_fixed_length(int64_t num_rows) const { @@ -326,23 +325,15 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, if (!metadata_.is_fixed_length) { // Varying-length rows - auto from_offsets = reinterpret_cast(from.offsets_->data()); - auto to_offsets = reinterpret_cast(offsets_->mutable_data()); - uint32_t total_length = to_offsets[num_rows_]; - uint32_t total_length_to_append = 0; + auto from_offsets = reinterpret_cast(from.offsets_->data()); + auto to_offsets = reinterpret_cast(offsets_->mutable_data()); + offset_type total_length = to_offsets[num_rows_]; + int64_t total_length_to_append = 0; for (uint32_t i = 0; i < num_rows_to_append; ++i) { uint16_t row_id = source_row_ids ? 
source_row_ids[i] : i; - uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + int64_t length = from_offsets[row_id + 1] - from_offsets[row_id]; total_length_to_append += length; - uint32_t to_offset_maybe_overflow = 0; - if (ARROW_PREDICT_FALSE(arrow::internal::AddWithOverflow( - total_length, total_length_to_append, &to_offset_maybe_overflow))) { - return Status::Invalid( - "Offset overflow detected in RowTableImpl::AppendSelectionFrom for row ", - num_rows_ + i, " of length ", length, " bytes, current length in total is ", - to_offsets[num_rows_ + i], " bytes"); - } - to_offsets[num_rows_ + i + 1] = to_offset_maybe_overflow; + to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append; } RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append)); @@ -351,7 +342,8 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, uint8_t* dst = rows_->mutable_data() + total_length; for (uint32_t i = 0; i < num_rows_to_append; ++i) { uint16_t row_id = source_row_ids ? source_row_ids[i] : i; - uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + int64_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + DCHECK_LE(length, std::numeric_limits::max()); auto src64 = reinterpret_cast(src + from_offsets[row_id]); auto dst64 = reinterpret_cast(dst); for (uint32_t j = 0; j < bit_util::CeilDiv(length, 8); ++j) { @@ -397,7 +389,7 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, } Status RowTableImpl::AppendEmpty(uint32_t num_rows_to_append, - uint32_t num_extra_bytes_to_append) { + int64_t num_extra_bytes_to_append) { RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append)); if (!metadata_.is_fixed_length) { RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append)); diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 93818fb14d6..094a9c31efe 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -30,6 +30,8 @@ namespace compute { /// Description of the data stored in a RowTable struct ARROW_EXPORT RowTableMetadata { + using offset_type = int64_t; + /// \brief True if there are no variable length columns in the table bool is_fixed_length; @@ -78,26 +80,35 @@ struct ARROW_EXPORT RowTableMetadata { /// Offsets within a row to fields in their encoding order. std::vector column_offsets; - /// Rounding up offset to the nearest multiple of alignment value. + /// Rounding up offset within row to the nearest multiple of alignment value. /// Alignment must be a power of 2. - static inline uint32_t padding_for_alignment(uint32_t offset, int required_alignment) { + static inline uint32_t padding_for_alignment_within_row(uint32_t offset, + int required_alignment) { ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1); return static_cast((-static_cast(offset)) & (required_alignment - 1)); } - /// Rounding up offset to the beginning of next column, + /// Rounding up offset within row to the beginning of next column, /// choosing required alignment based on the data type of that column. 
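All of the padding helpers in this struct use the same power-of-two trick: negating the offset and masking with `alignment - 1` yields the distance to the next aligned boundary. A short standalone sketch with concrete numbers (the helper name here is just for illustration):

#include <cassert>
#include <cstdint>

// Distance from `offset` to the next multiple of `alignment` (a power of 2).
constexpr int64_t PaddingFor(int64_t offset, int64_t alignment) {
  return (-offset) & (alignment - 1);
}

int main() {
  assert(PaddingFor(13, 8) == 3);             // 13 -> 16
  assert(PaddingFor(16, 8) == 0);             // already aligned
  assert(PaddingFor(4294967299LL, 4) == 1);   // still correct past 4 GB with 64-bit offsets
  return 0;
}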
- static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment, - const KeyColumnMetadata& col_metadata) { + static inline uint32_t padding_for_alignment_within_row( + uint32_t offset, int string_alignment, const KeyColumnMetadata& col_metadata) { if (!col_metadata.is_fixed_length || ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) { return 0; } else { - return padding_for_alignment(offset, string_alignment); + return padding_for_alignment_within_row(offset, string_alignment); } } + /// Rounding up row offset to the nearest multiple of alignment value. + /// Alignment must be a power of 2. + static inline offset_type padding_for_alignment_row(offset_type row_offset, + int required_alignment) { + ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1); + return (-row_offset) & (required_alignment - 1); + } + /// Returns an array of offsets within a row of ends of varbinary fields. inline const uint32_t* varbinary_end_array(const uint8_t* row) const { ARROW_DCHECK(!is_fixed_length); @@ -127,7 +138,7 @@ struct ARROW_EXPORT RowTableMetadata { ARROW_DCHECK(varbinary_id > 0); const uint32_t* varbinary_end = varbinary_end_array(row); uint32_t offset = varbinary_end[varbinary_id - 1]; - offset += padding_for_alignment(offset, string_alignment); + offset += padding_for_alignment_within_row(offset, string_alignment); *out_offset = offset; *out_length = varbinary_end[varbinary_id] - offset; } @@ -161,6 +172,8 @@ struct ARROW_EXPORT RowTableMetadata { /// The row table is not safe class ARROW_EXPORT RowTableImpl { public: + using offset_type = RowTableMetadata::offset_type; + RowTableImpl(); /// \brief Initialize a row array for use /// @@ -175,7 +188,7 @@ class ARROW_EXPORT RowTableImpl { /// \param num_extra_bytes_to_append For tables storing variable-length data this /// should be a guess of how many data bytes will be needed to populate the /// data. 
This is ignored if there are no variable-length columns - Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append); + Status AppendEmpty(uint32_t num_rows_to_append, int64_t num_extra_bytes_to_append); /// \brief Append rows from a source table /// \param from The table to append from /// \param num_rows_to_append The number of rows to append @@ -201,8 +214,12 @@ class ARROW_EXPORT RowTableImpl { } return NULLPTR; } - const uint32_t* offsets() const { return reinterpret_cast(data(1)); } - uint32_t* mutable_offsets() { return reinterpret_cast(mutable_data(1)); } + const offset_type* offsets() const { + return reinterpret_cast(data(1)); + } + offset_type* mutable_offsets() { + return reinterpret_cast(mutable_data(1)); + } const uint8_t* null_masks() const { return null_masks_->data(); } uint8_t* null_masks() { return null_masks_->mutable_data(); } diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 75f981fb128..6aed9e43278 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -123,7 +123,7 @@ TEST(RowTableMemoryConsumption, Encode) { ASSERT_GT(actual_null_mask_size * 2, row_table.buffer_size(0) - padding_for_vectors); - int64_t actual_offset_size = num_rows * sizeof(uint32_t); + int64_t actual_offset_size = num_rows * sizeof(RowTableImpl::offset_type); ASSERT_LE(actual_offset_size, row_table.buffer_size(1) - padding_for_vectors); ASSERT_GT(actual_offset_size * 2, row_table.buffer_size(1) - padding_for_vectors); @@ -134,15 +134,14 @@ TEST(RowTableMemoryConsumption, Encode) { } } -// GH-43202: Ensure that when offset overflow happens in encoding the row table, an -// explicit error is raised instead of a silent wrong result. -TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) { +// GH-43495: Ensure that we can build a row table with more than 4GB row data. +TEST(RowTableLarge, LARGE_MEMORY_TEST(Encode)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } - // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table. - constexpr int64_t num_rows = 8; + // Use 9 512MB var-length rows to occupy more than 4GB memory. + constexpr int64_t num_rows = 9; constexpr int64_t length_per_binary = 512 * 1024 * 1024; constexpr int64_t row_alignment = sizeof(uint32_t); constexpr int64_t var_length_alignment = sizeof(uint32_t); @@ -174,39 +173,24 @@ TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) { // The rows to encode. std::vector row_ids(num_rows, 0); - // Encoding 7 rows should be fine. - { - row_encoder.PrepareEncodeSelected(0, num_rows - 1, columns); - ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast(num_rows - 1), - row_ids.data())); - } + // Encode num_rows rows. + row_encoder.PrepareEncodeSelected(0, num_rows, columns); + ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast(num_rows), + row_ids.data())); - // Encoding 8 rows should overflow. 
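The offset assertions added below are easy to check by hand. This sketch only spells out the arithmetic; `kFixedLength` stands in for `table_metadata.fixed_length`, whose exact value depends on the row metadata, so treat it as a placeholder.

#include <cstdint>

constexpr int64_t kLengthPerBinary = 512LL * 1024 * 1024;  // 512 MiB per row
constexpr int64_t kFixedLength = 16;                       // hypothetical fixed part
constexpr int64_t kEncodedRowLength = kFixedLength + kLengthPerBinary;

// The 9th row starts past 4 GB, beyond what uint32_t offsets could represent,
// which is exactly the regime the rewritten tests exercise.
static_assert(kEncodedRowLength * 8 > 4LL * 1024 * 1024 * 1024,
              "offset of the last row must exceed 4GB");

int main() { return 0; }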
- { - int64_t length_per_row = table_metadata.fixed_length + length_per_binary; - std::stringstream expected_error_message; - expected_error_message << "Invalid: Offset overflow detected in " - "EncoderOffsets::GetRowOffsetsSelected for row " - << num_rows - 1 << " of length " << length_per_row - << " bytes, current length in total is " - << length_per_row * (num_rows - 1) << " bytes"; - row_encoder.PrepareEncodeSelected(0, num_rows, columns); - ASSERT_RAISES_WITH_MESSAGE( - Invalid, expected_error_message.str(), - row_encoder.EncodeSelected(&row_table, static_cast(num_rows), - row_ids.data())); - } + auto encoded_row_length = table_metadata.fixed_length + length_per_binary; + ASSERT_EQ(row_table.offsets()[num_rows - 1], encoded_row_length * (num_rows - 1)); + ASSERT_EQ(row_table.offsets()[num_rows], encoded_row_length * num_rows); } -// GH-43202: Ensure that when offset overflow happens in appending to the row table, an -// explicit error is raised instead of a silent wrong result. -TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) { +// GH-43495: Ensure that we can build a row table with more than 4GB row data. +TEST(RowTableLarge, LARGE_MEMORY_TEST(AppendFrom)) { if constexpr (sizeof(void*) == 4) { GTEST_SKIP() << "Test only works on 64-bit platforms"; } - // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table. - constexpr int64_t num_rows = 8; + // Use 9 512MB var-length rows to occupy more than 4GB memory. + constexpr int64_t num_rows = 9; constexpr int64_t length_per_binary = 512 * 1024 * 1024; constexpr int64_t num_rows_seed = 1; constexpr int64_t row_alignment = sizeof(uint32_t); @@ -244,23 +228,15 @@ TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) { RowTableImpl row_table; ASSERT_OK(row_table.Init(pool, table_metadata)); - // Appending the seed 7 times should be fine. - for (int i = 0; i < num_rows - 1; ++i) { + // Append seed num_rows times. + for (int i = 0; i < num_rows; ++i) { ASSERT_OK(row_table.AppendSelectionFrom(row_table_seed, num_rows_seed, /*source_row_ids=*/NULLPTR)); } - // Appending the seed the 8-th time should overflow. 
- int64_t length_per_row = table_metadata.fixed_length + length_per_binary; - std::stringstream expected_error_message; - expected_error_message - << "Invalid: Offset overflow detected in RowTableImpl::AppendSelectionFrom for row " - << num_rows - 1 << " of length " << length_per_row - << " bytes, current length in total is " << length_per_row * (num_rows - 1) - << " bytes"; - ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error_message.str(), - row_table.AppendSelectionFrom(row_table_seed, num_rows_seed, - /*source_row_ids=*/NULLPTR)); + auto encoded_row_length = table_metadata.fixed_length + length_per_binary; + ASSERT_EQ(row_table.offsets()[num_rows - 1], encoded_row_length * (num_rows - 1)); + ASSERT_EQ(row_table.offsets()[num_rows], encoded_row_length * num_rows); } } // namespace compute diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index c317fe7aef4..59de09fff83 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -473,19 +473,16 @@ std::shared_ptr RandomArrayGenerator::StringWithRepeats( return result; } -std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, - int32_t byte_width, - double null_probability, - int64_t alignment, - MemoryPool* memory_pool) { +std::shared_ptr RandomArrayGenerator::FixedSizeBinary( + int64_t size, int32_t byte_width, double null_probability, uint8_t min_byte, + uint8_t max_byte, int64_t alignment, MemoryPool* memory_pool) { if (null_probability < 0 || null_probability > 1) { ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1")); } // Visual Studio does not implement uniform_int_distribution for char types. using GenOpt = GenerateOptions>; - GenOpt options(seed(), static_cast('A'), static_cast('z'), - null_probability); + GenOpt options(seed(), min_byte, max_byte, null_probability); int64_t null_count = 0; auto null_bitmap = *AllocateEmptyBitmap(size, alignment, memory_pool); @@ -1087,7 +1084,9 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t case Type::type::FIXED_SIZE_BINARY: { auto byte_width = internal::checked_pointer_cast(field.type())->byte_width(); - return *FixedSizeBinary(length, byte_width, null_probability, alignment, + return *FixedSizeBinary(length, byte_width, null_probability, + /*min_byte=*/static_cast('A'), + /*min_byte=*/static_cast('z'), alignment, memory_pool) ->View(field.type()); } @@ -1143,7 +1142,9 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t // type means it's not a (useful) composition of other generators GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType); case Type::type::INTERVAL_MONTH_DAY_NANO: { - return *FixedSizeBinary(length, /*byte_width=*/16, null_probability, alignment, + return *FixedSizeBinary(length, /*byte_width=*/16, null_probability, + /*min_byte=*/static_cast('A'), + /*min_byte=*/static_cast('z'), alignment, memory_pool) ->View(month_day_nano_interval()); } diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1d97a3ada72..9c0c5baae0f 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -434,12 +434,18 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// \param[in] size the size of the array to generate /// \param[in] byte_width the byte width of fixed-size binary items /// \param[in] null_probability the probability of a value being null + /// \param[in] min_byte the lower bound of each byte in the binary determined by the + /// uniform distribution + /// \param[in] max_byte the 
upper bound of each byte in the binary determined by the + /// uniform distribution /// \param[in] alignment alignment for memory allocations (in bytes) /// \param[in] memory_pool memory pool to allocate memory from /// /// \return a generated Array std::shared_ptr FixedSizeBinary(int64_t size, int32_t byte_width, double null_probability = 0, + uint8_t min_byte = static_cast('A'), + uint8_t max_byte = static_cast('z'), int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); From c599fa0064a627d3b58d4eff821a34391120bcf6 Mon Sep 17 00:00:00 2001 From: Tom Scott-Coombes <62209801+tscottcoombes1@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:13:35 +0100 Subject: [PATCH 035/157] GH-43554: [Go] Handle excluded fields (#43555) ### Rationale for this change We want to be able to handle excluded fields. ### What changes are included in this PR? * we no longer use the value of the field when getting the element type of a list (as the values are invalid for excluded fields) * similarly for map, key value pairs, we don't use the value is there is none * add some tests ### Are these changes tested? yes ### Are there any user-facing changes? no * GitHub Issue: #43554 Lead-authored-by: Tom Scott-Coombes Co-authored-by: Tom Scott-Coombes <62209801+tscottcoombes1@users.noreply.github.com> Co-authored-by: Matt Topol Co-authored-by: tscottcoombes1 <62209801+tscottcoombes1@users.noreply.github.com> Co-authored-by: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Signed-off-by: Joel Lubinitsky --- go/arrow/util/messages/types.proto | 46 ++ go/arrow/util/protobuf_reflect.go | 31 +- go/arrow/util/protobuf_reflect_test.go | 421 +++++++++++----- go/arrow/util/util_message/types.pb.go | 654 +++++++++++++++++++++++-- 4 files changed, 996 insertions(+), 156 deletions(-) diff --git a/go/arrow/util/messages/types.proto b/go/arrow/util/messages/types.proto index c085273ca35..79b922a22a3 100644 --- a/go/arrow/util/messages/types.proto +++ b/go/arrow/util/messages/types.proto @@ -54,3 +54,49 @@ message AllTheTypes { OPTION_1 = 1; } } + +message AllTheTypesNoAny { + string str = 1; + int32 int32 = 2; + int64 int64 = 3; + sint32 sint32 = 4; + sint64 sin64 = 5; + uint32 uint32 = 6; + uint64 uint64 = 7; + fixed32 fixed32 = 8; + fixed64 fixed64 = 9; + sfixed32 sfixed32 = 10; + bool bool = 11; + bytes bytes = 12; + double double = 13; + ExampleEnum enum = 14; + ExampleMessage message = 15; + oneof oneof { + string oneofstring = 16; + ExampleMessage oneofmessage = 17; + } + map simple_map = 19; + map complex_map = 20; + repeated string simple_list = 21; + repeated ExampleMessage complex_list = 22; + + enum ExampleEnum { + OPTION_0 = 0; + OPTION_1 = 1; + } +} + +message SimpleNested { + repeated ExampleMessage simple_a = 1; + repeated ExampleMessage simple_b = 2; +} + +message ComplexNested { + repeated AllTheTypesNoAny all_the_types_no_any_a = 1; + repeated AllTheTypesNoAny all_the_types_no_any_b = 2; +} + +message DeepNested { + ComplexNested complex_nested = 1; + SimpleNested simple_nested = 2; +} diff --git a/go/arrow/util/protobuf_reflect.go b/go/arrow/util/protobuf_reflect.go index 03153563b8c..c8cda96acf9 100644 --- a/go/arrow/util/protobuf_reflect.go +++ b/go/arrow/util/protobuf_reflect.go @@ -60,6 +60,7 @@ type ProtobufFieldReflection struct { rValue reflect.Value schemaOptions arrow.Field + isListItem bool } func (pfr *ProtobufFieldReflection) isNull() bool { @@ -170,7 +171,7 @@ func (pfr *ProtobufFieldReflection) isEnum() bool { } func (pfr 
*ProtobufFieldReflection) isStruct() bool { - return pfr.descriptor.Kind() == protoreflect.MessageKind && !pfr.descriptor.IsMap() && pfr.rValue.Kind() != reflect.Slice + return pfr.descriptor.Kind() == protoreflect.MessageKind && !pfr.descriptor.IsMap() && !pfr.isList() } func (pfr *ProtobufFieldReflection) isMap() bool { @@ -178,7 +179,7 @@ func (pfr *ProtobufFieldReflection) isMap() bool { } func (pfr *ProtobufFieldReflection) isList() bool { - return pfr.descriptor.IsList() && pfr.rValue.Kind() == reflect.Slice + return pfr.descriptor.IsList() && !pfr.isListItem } // ProtobufMessageReflection represents the metadata and values of a protobuf message @@ -218,11 +219,7 @@ func (psr ProtobufMessageReflection) getArrowFields() []arrow.Field { var fields []arrow.Field for pfr := range psr.generateStructFields() { - fields = append(fields, arrow.Field{ - Name: pfr.name(), - Type: pfr.getDataType(), - Nullable: true, - }) + fields = append(fields, pfr.arrowField()) } return fields @@ -237,12 +234,10 @@ func (pfr *ProtobufFieldReflection) asList() protobufListReflection { } func (plr protobufListReflection) getDataType() arrow.DataType { - for li := range plr.generateListItems() { - return arrow.ListOf(li.getDataType()) - } pfr := ProtobufFieldReflection{ descriptor: plr.descriptor, schemaOptions: plr.schemaOptions, + isListItem: true, } return arrow.ListOf(pfr.getDataType()) } @@ -401,6 +396,22 @@ func (pmr protobufMapReflection) generateKeyValuePairs() chan protobufMapKeyValu go func() { defer close(out) + if !pmr.rValue.IsValid() { + kvp := protobufMapKeyValuePairReflection{ + k: ProtobufFieldReflection{ + parent: pmr.parent, + descriptor: pmr.descriptor.MapKey(), + schemaOptions: pmr.schemaOptions, + }, + v: ProtobufFieldReflection{ + parent: pmr.parent, + descriptor: pmr.descriptor.MapValue(), + schemaOptions: pmr.schemaOptions, + }, + } + out <- kvp + return + } for _, k := range pmr.rValue.MapKeys() { kvp := protobufMapKeyValuePairReflection{ k: ProtobufFieldReflection{ diff --git a/go/arrow/util/protobuf_reflect_test.go b/go/arrow/util/protobuf_reflect_test.go index 220552df8d8..7420aa72633 100644 --- a/go/arrow/util/protobuf_reflect_test.go +++ b/go/arrow/util/protobuf_reflect_test.go @@ -17,9 +17,12 @@ package util import ( - "strings" + "encoding/json" + "fmt" "testing" + "google.golang.org/protobuf/proto" + "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/memory" @@ -30,14 +33,52 @@ import ( "google.golang.org/protobuf/types/known/anypb" ) -func SetupTest() util_message.AllTheTypes { - msg := util_message.ExampleMessage{ - Field1: "Example", +type Fixture struct { + msg proto.Message + schema string + jsonStr string +} + +type J map[string]any + +func AllTheTypesFixture() Fixture { + e := J{"field1": "Example"} + + m := J{ + "str": "Hello", + "int32": 10, + "int64": 100, + "sint32": -10, + "sin64": -100, + "uint32": 10, + "uint64": 100, + "fixed32": 10, + "fixed64": 1000, + "sfixed32": 10, + "bool": false, + "bytes": "SGVsbG8sIHdvcmxkIQ==", + "double": 1.1, + "enum": "OPTION_1", + "message": e, + "oneof": []any{0, "World"}, + "any": J{"field1": "Example"}, + "simple_map": []J{{"key": 99, "value": "Hello"}}, + "complex_map": []J{{"key": "complex", "value": e}}, + "simple_list": []any{"Hello", "World"}, + "complex_list": []J{e}, } + jm, err := json.Marshal(m) + if err != nil { + panic(err) + } + jsonString := string(jm) - anyMsg, _ := anypb.New(&msg) + exampleMsg := util_message.ExampleMessage{ + Field1: 
"Example", + } + anyMsg, _ := anypb.New(&exampleMsg) - return util_message.AllTheTypes{ + msg := util_message.AllTheTypes{ Str: "Hello", Int32: 10, Int64: 100, @@ -52,23 +93,80 @@ func SetupTest() util_message.AllTheTypes { Bytes: []byte("Hello, world!"), Double: 1.1, Enum: util_message.AllTheTypes_OPTION_1, - Message: &msg, + Message: &exampleMsg, Oneof: &util_message.AllTheTypes_Oneofstring{Oneofstring: "World"}, Any: anyMsg, //Breaks the test as the Golang maps have a non-deterministic order //SimpleMap: map[int32]string{99: "Hello", 100: "World", 98: "How", 101: "Are", 1: "You"}, SimpleMap: map[int32]string{99: "Hello"}, - ComplexMap: map[string]*util_message.ExampleMessage{"complex": &msg}, + ComplexMap: map[string]*util_message.ExampleMessage{"complex": &exampleMsg}, SimpleList: []string{"Hello", "World"}, - ComplexList: []*util_message.ExampleMessage{&msg}, + ComplexList: []*util_message.ExampleMessage{&exampleMsg}, + } + + schema := `schema: + fields: 22 + - str: type=utf8, nullable + - int32: type=int32, nullable + - int64: type=int64, nullable + - sint32: type=int32, nullable + - sin64: type=int64, nullable + - uint32: type=uint32, nullable + - uint64: type=uint64, nullable + - fixed32: type=uint32, nullable + - fixed64: type=uint64, nullable + - sfixed32: type=int32, nullable + - bool: type=bool, nullable + - bytes: type=binary, nullable + - double: type=float64, nullable + - enum: type=dictionary, nullable + - message: type=struct, nullable + - oneofstring: type=utf8, nullable + - oneofmessage: type=struct, nullable + - any: type=struct, nullable + - simple_map: type=map, nullable + - complex_map: type=map, items_nullable>, nullable + - simple_list: type=list, nullable + - complex_list: type=list, nullable>, nullable` + + return Fixture{ + msg: &msg, + schema: schema, + jsonStr: jsonString, } } -func TestGetSchema(t *testing.T) { - msg := SetupTest() +func AllTheTypesNoAnyFixture() Fixture { + exampleMsg := util_message.ExampleMessage{ + Field1: "Example", + } - got := NewProtobufMessageReflection(&msg).Schema().String() - want := `schema: + msg := util_message.AllTheTypesNoAny{ + Str: "Hello", + Int32: 10, + Int64: 100, + Sint32: -10, + Sin64: -100, + Uint32: 10, + Uint64: 100, + Fixed32: 10, + Fixed64: 1000, + Sfixed32: 10, + Bool: false, + Bytes: []byte("Hello, world!"), + Double: 1.1, + Enum: util_message.AllTheTypesNoAny_OPTION_1, + Message: &exampleMsg, + Oneof: &util_message.AllTheTypesNoAny_Oneofstring{Oneofstring: "World"}, + //Breaks the test as the Golang maps have a non-deterministic order + //SimpleMap: map[int32]string{99: "Hello", 100: "World", 98: "How", 101: "Are", 1: "You"}, + SimpleMap: map[int32]string{99: "Hello"}, + ComplexMap: map[string]*util_message.ExampleMessage{"complex": &exampleMsg}, + SimpleList: []string{"Hello", "World"}, + ComplexList: []*util_message.ExampleMessage{&exampleMsg}, + } + + schema := `schema: fields: 22 - str: type=utf8, nullable - int32: type=int32, nullable @@ -87,16 +185,62 @@ func TestGetSchema(t *testing.T) { - message: type=struct, nullable - oneofstring: type=utf8, nullable - oneofmessage: type=struct, nullable - - any: type=struct, nullable - simple_map: type=map, nullable - complex_map: type=map, items_nullable>, nullable - simple_list: type=list, nullable - complex_list: type=list, nullable>, nullable` - require.Equal(t, want, got, "got: %s\nwant: %s", got, want) + jsonStr := `{ + "str":"Hello", + "int32":10, + "int64":100, + "sint32":-10, + "sin64":-100, + "uint32":10, + "uint64":100, + "fixed32":10, + "fixed64":1000, + 
"sfixed32":10, + "bool":false, + "bytes":"SGVsbG8sIHdvcmxkIQ==", + "double":1.1, + "enum":"OPTION_1", + "message":{"field1":"Example"}, + "oneofmessage": { "field1": null }, + "oneofstring": "World", + "simple_map":[{"key":99,"value":"Hello"}], + "complex_map":[{"key":"complex","value":{"field1":"Example"}}], + "simple_list":["Hello","World"], + "complex_list":[{"field1":"Example"}] + }` + + return Fixture{ + msg: &msg, + schema: schema, + jsonStr: jsonStr, + } +} - got = NewProtobufMessageReflection(&msg, WithOneOfHandler(OneOfDenseUnion)).Schema().String() - want = `schema: +func CheckSchema(t *testing.T, pmr *ProtobufMessageReflection, want string) { + got := pmr.Schema().String() + require.Equal(t, got, want, "got: %s\nwant: %s", got, want) +} + +func CheckRecord(t *testing.T, pmr *ProtobufMessageReflection, jsonStr string) { + rec := pmr.Record(nil) + got, err := json.Marshal(rec) + assert.NoError(t, err) + assert.JSONEq(t, jsonStr, string(got), "got: %s\nwant: %s", got, jsonStr) +} + +func TestGetSchema(t *testing.T) { + f := AllTheTypesFixture() + + pmr := NewProtobufMessageReflection(f.msg) + CheckSchema(t, pmr, f.schema) + + pmr = NewProtobufMessageReflection(f.msg, WithOneOfHandler(OneOfDenseUnion)) + want := `schema: fields: 21 - str: type=utf8, nullable - int32: type=int32, nullable @@ -119,14 +263,13 @@ func TestGetSchema(t *testing.T) { - complex_map: type=map, items_nullable>, nullable - simple_list: type=list, nullable - complex_list: type=list, nullable>, nullable` - - require.Equal(t, want, got, "got: %s\nwant: %s", got, want) + CheckSchema(t, pmr, want) excludeComplex := func(pfr *ProtobufFieldReflection) bool { return pfr.isMap() || pfr.isList() || pfr.isStruct() } - got = NewProtobufMessageReflection(&msg, WithExclusionPolicy(excludeComplex)).Schema().String() + pmr = NewProtobufMessageReflection(f.msg, WithExclusionPolicy(excludeComplex)) want = `schema: fields: 15 - str: type=utf8, nullable @@ -144,14 +287,13 @@ func TestGetSchema(t *testing.T) { - double: type=float64, nullable - enum: type=dictionary, nullable - oneofstring: type=utf8, nullable` + CheckSchema(t, pmr, want) - require.Equal(t, want, got, "got: %s\nwant: %s", got, want) - - got = NewProtobufMessageReflection( - &msg, + pmr = NewProtobufMessageReflection( + f.msg, WithExclusionPolicy(excludeComplex), WithFieldNameFormatter(xstrings.ToCamelCase), - ).Schema().String() + ) want = `schema: fields: 15 - Str: type=utf8, nullable @@ -169,123 +311,168 @@ func TestGetSchema(t *testing.T) { - Double: type=float64, nullable - Enum: type=dictionary, nullable - Oneofstring: type=utf8, nullable` - - require.Equal(t, want, got, "got: %s\nwant: %s", got, want) + CheckSchema(t, pmr, want) onlyEnum := func(pfr *ProtobufFieldReflection) bool { return !pfr.isEnum() } - got = NewProtobufMessageReflection( - &msg, + pmr = NewProtobufMessageReflection( + f.msg, WithExclusionPolicy(onlyEnum), WithEnumHandler(EnumNumber), - ).Schema().String() + ) want = `schema: fields: 1 - enum: type=int32, nullable` + CheckSchema(t, pmr, want) - require.Equal(t, want, got, "got: %s\nwant: %s", got, want) - - got = NewProtobufMessageReflection( - &msg, + pmr = NewProtobufMessageReflection( + f.msg, WithExclusionPolicy(onlyEnum), WithEnumHandler(EnumValue), - ).Schema().String() + ) want = `schema: fields: 1 - enum: type=utf8, nullable` - - require.Equal(t, want, got, "got: %s\nwant: %s", got, want) + CheckSchema(t, pmr, want) } func TestRecordFromProtobuf(t *testing.T) { - msg := SetupTest() - - pmr := NewProtobufMessageReflection(&msg, 
WithOneOfHandler(OneOfDenseUnion)) - schema := pmr.Schema() - got := pmr.Record(nil) - jsonStr := `[ - { - "str":"Hello", - "int32":10, - "int64":100, - "sint32":-10, - "sin64":-100, - "uint32":10, - "uint64":100, - "fixed32":10, - "fixed64":1000, - "sfixed32":10, - "bool":false, - "bytes":"SGVsbG8sIHdvcmxkIQ==", - "double":1.1, - "enum":"OPTION_1", - "message":{"field1":"Example"}, - "oneof": [0, "World"], - "any":{"field1":"Example"}, - "simple_map":[{"key":99,"value":"Hello"}], - "complex_map":[{"key":"complex","value":{"field1":"Example"}}], - "simple_list":["Hello","World"], - "complex_list":[{"field1":"Example"}] - } - ]` - want, _, err := array.RecordFromJSON(memory.NewGoAllocator(), schema, strings.NewReader(jsonStr)) + f := AllTheTypesFixture() - require.NoError(t, err) - require.EqualExportedValues(t, got, want, "got: %s\nwant: %s", got, want) + pmr := NewProtobufMessageReflection(f.msg, WithOneOfHandler(OneOfDenseUnion)) + CheckRecord(t, pmr, fmt.Sprintf(`[%s]`, f.jsonStr)) onlyEnum := func(pfr *ProtobufFieldReflection) bool { return !pfr.isEnum() } - pmr = NewProtobufMessageReflection(&msg, WithExclusionPolicy(onlyEnum), WithEnumHandler(EnumValue)) - got = pmr.Record(nil) - jsonStr = `[ { "enum":"OPTION_1" } ]` - want, _, err = array.RecordFromJSON(memory.NewGoAllocator(), pmr.Schema(), strings.NewReader(jsonStr)) - require.NoError(t, err) - require.True(t, array.RecordEqual(got, want), "got: %s\nwant: %s", got, want) - - pmr = NewProtobufMessageReflection(&msg, WithExclusionPolicy(onlyEnum), WithEnumHandler(EnumNumber)) - got = pmr.Record(nil) - jsonStr = `[ { "enum":"1" } ]` - want, _, err = array.RecordFromJSON(memory.NewGoAllocator(), pmr.Schema(), strings.NewReader(jsonStr)) - require.NoError(t, err) - require.True(t, array.RecordEqual(got, want), "got: %s\nwant: %s", got, want) + pmr = NewProtobufMessageReflection(f.msg, WithExclusionPolicy(onlyEnum), WithEnumHandler(EnumValue)) + jsonStr := `[ { "enum":"OPTION_1" } ]` + CheckRecord(t, pmr, jsonStr) + + pmr = NewProtobufMessageReflection(f.msg, WithExclusionPolicy(onlyEnum), WithEnumHandler(EnumNumber)) + jsonStr = `[ { "enum":1 } ]` + CheckRecord(t, pmr, jsonStr) } func TestNullRecordFromProtobuf(t *testing.T) { pmr := NewProtobufMessageReflection(&util_message.AllTheTypes{}) - schema := pmr.Schema() - got := pmr.Record(nil) - _, _ = got.MarshalJSON() - jsonStr := `[ - { - "str":"", - "int32":0, - "int64":0, - "sint32":0, - "sin64":0, - "uint32":0, - "uint64":0, - "fixed32":0, - "fixed64":0, - "sfixed32":0, - "bool":false, - "bytes":"", - "double":0, - "enum":"OPTION_0", - "message":null, - "oneofmessage":{"field1":""}, - "oneofstring":"", - "any":null, - "simple_map":[], - "complex_map":[], - "simple_list":[], - "complex_list":[] - } - ]` - - want, _, err := array.RecordFromJSON(memory.NewGoAllocator(), schema, strings.NewReader(jsonStr)) - - require.NoError(t, err) - require.EqualExportedValues(t, got, want, "got: %s\nwant: %s", got, want) + CheckRecord(t, pmr, `[{ + "str":"", + "int32":0, + "int64":0, + "sint32":0, + "sin64":0, + "uint32":0, + "uint64":0, + "fixed32":0, + "fixed64":0, + "sfixed32":0, + "bool":false, + "bytes":null, + "double":0, + "enum":"OPTION_0", + "message":null, + "oneofmessage":{"field1":""}, + "oneofstring":"", + "any": null, + "simple_map":[], + "complex_map":[], + "simple_list":[], + "complex_list":[] + }]`) +} + +func TestExcludedNested(t *testing.T) { + msg := util_message.ExampleMessage{ + Field1: "Example", + } + schema := `schema: + fields: 2 + - simple_a: type=list, nullable>, nullable 
+ - simple_b: type=list, nullable>, nullable` + + simpleNested := util_message.SimpleNested{ + SimpleA: []*util_message.ExampleMessage{&msg}, + SimpleB: []*util_message.ExampleMessage{&msg}, + } + pmr := NewProtobufMessageReflection(&simpleNested) + jsonStr := `[{ "simple_a":[{"field1":"Example"}], "simple_b":[{"field1":"Example"}] }]` + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + //exclude one value + simpleNested = util_message.SimpleNested{ + SimpleA: []*util_message.ExampleMessage{&msg}, + } + jsonStr = `[{ "simple_a":[{"field1":"Example"}], "simple_b":[]}]` + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + ////exclude both values + simpleNested = util_message.SimpleNested{} + jsonStr = `[{ "simple_a":[], "simple_b":[] }]` + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + f := AllTheTypesNoAnyFixture() + schema = `schema: + fields: 2 + - all_the_types_no_any_a: type=list, message: struct, oneofstring: utf8, oneofmessage: struct, simple_map: map, complex_map: map, items_nullable>, simple_list: list, complex_list: list, nullable>>, nullable>, nullable + - all_the_types_no_any_b: type=list, message: struct, oneofstring: utf8, oneofmessage: struct, simple_map: map, complex_map: map, items_nullable>, simple_list: list, complex_list: list, nullable>>, nullable>, nullable` + + complexNested := util_message.ComplexNested{ + AllTheTypesNoAnyA: []*util_message.AllTheTypesNoAny{f.msg.(*util_message.AllTheTypesNoAny)}, + AllTheTypesNoAnyB: []*util_message.AllTheTypesNoAny{f.msg.(*util_message.AllTheTypesNoAny)}, + } + jsonStr = fmt.Sprintf(`[{ "all_the_types_no_any_a": [%s], "all_the_types_no_any_b": [%s] }]`, f.jsonStr, f.jsonStr) + pmr = NewProtobufMessageReflection(&complexNested) + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + // exclude one value + complexNested = util_message.ComplexNested{ + AllTheTypesNoAnyB: []*util_message.AllTheTypesNoAny{f.msg.(*util_message.AllTheTypesNoAny)}, + } + jsonStr = fmt.Sprintf(`[{ "all_the_types_no_any_a": [], "all_the_types_no_any_b": [%s] }]`, f.jsonStr) + pmr = NewProtobufMessageReflection(&complexNested) + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + // exclude both values + complexNested = util_message.ComplexNested{} + jsonStr = `[{ "all_the_types_no_any_a": [], "all_the_types_no_any_b": [] }]` + pmr = NewProtobufMessageReflection(&complexNested) + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + schema = `schema: + fields: 2 + - complex_nested: type=struct, message: struct, oneofstring: utf8, oneofmessage: struct, simple_map: map, complex_map: map, items_nullable>, simple_list: list, complex_list: list, nullable>>, nullable>, all_the_types_no_any_b: list, message: struct, oneofstring: utf8, oneofmessage: struct, simple_map: map, complex_map: map, items_nullable>, simple_list: list, complex_list: list, nullable>>, nullable>>, nullable + - simple_nested: type=struct, nullable>, simple_b: list, nullable>>, nullable` + + deepNested := util_message.DeepNested{ + ComplexNested: &complexNested, + SimpleNested: &simpleNested, + } + jsonStr = `[{ "simple_nested": {"simple_a":[], "simple_b":[]}, "complex_nested": {"all_the_types_no_any_a": [], "all_the_types_no_any_b": []} }]` + pmr = NewProtobufMessageReflection(&deepNested) + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + // exclude one value + deepNested = util_message.DeepNested{ + ComplexNested: &complexNested, + } + jsonStr = `[{ "simple_nested": null, "complex_nested": 
{"all_the_types_no_any_a": [], "all_the_types_no_any_b": []} }]` + pmr = NewProtobufMessageReflection(&deepNested) + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) + + // exclude both values + deepNested = util_message.DeepNested{} + pmr = NewProtobufMessageReflection(&deepNested) + jsonStr = `[{ "simple_nested": null, "complex_nested": null }]` + CheckSchema(t, pmr, schema) + CheckRecord(t, pmr, jsonStr) } type testProtobufReflection struct { diff --git a/go/arrow/util/util_message/types.pb.go b/go/arrow/util/util_message/types.pb.go index 80e18847c19..6486b2cc87a 100644 --- a/go/arrow/util/util_message/types.pb.go +++ b/go/arrow/util/util_message/types.pb.go @@ -23,12 +23,11 @@ package util_message import ( - reflect "reflect" - sync "sync" - protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" anypb "google.golang.org/protobuf/types/known/anypb" + reflect "reflect" + sync "sync" ) const ( @@ -84,6 +83,52 @@ func (AllTheTypes_ExampleEnum) EnumDescriptor() ([]byte, []int) { return file_messages_types_proto_rawDescGZIP(), []int{1, 0} } +type AllTheTypesNoAny_ExampleEnum int32 + +const ( + AllTheTypesNoAny_OPTION_0 AllTheTypesNoAny_ExampleEnum = 0 + AllTheTypesNoAny_OPTION_1 AllTheTypesNoAny_ExampleEnum = 1 +) + +// Enum value maps for AllTheTypesNoAny_ExampleEnum. +var ( + AllTheTypesNoAny_ExampleEnum_name = map[int32]string{ + 0: "OPTION_0", + 1: "OPTION_1", + } + AllTheTypesNoAny_ExampleEnum_value = map[string]int32{ + "OPTION_0": 0, + "OPTION_1": 1, + } +) + +func (x AllTheTypesNoAny_ExampleEnum) Enum() *AllTheTypesNoAny_ExampleEnum { + p := new(AllTheTypesNoAny_ExampleEnum) + *p = x + return p +} + +func (x AllTheTypesNoAny_ExampleEnum) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (AllTheTypesNoAny_ExampleEnum) Descriptor() protoreflect.EnumDescriptor { + return file_messages_types_proto_enumTypes[1].Descriptor() +} + +func (AllTheTypesNoAny_ExampleEnum) Type() protoreflect.EnumType { + return &file_messages_types_proto_enumTypes[1] +} + +func (x AllTheTypesNoAny_ExampleEnum) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use AllTheTypesNoAny_ExampleEnum.Descriptor instead. 
+func (AllTheTypesNoAny_ExampleEnum) EnumDescriptor() ([]byte, []int) { + return file_messages_types_proto_rawDescGZIP(), []int{2, 0} +} + type ExampleMessage struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -372,6 +417,404 @@ func (*AllTheTypes_Oneofstring) isAllTheTypes_Oneof() {} func (*AllTheTypes_Oneofmessage) isAllTheTypes_Oneof() {} +type AllTheTypesNoAny struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Str string `protobuf:"bytes,1,opt,name=str,proto3" json:"str,omitempty"` + Int32 int32 `protobuf:"varint,2,opt,name=int32,proto3" json:"int32,omitempty"` + Int64 int64 `protobuf:"varint,3,opt,name=int64,proto3" json:"int64,omitempty"` + Sint32 int32 `protobuf:"zigzag32,4,opt,name=sint32,proto3" json:"sint32,omitempty"` + Sin64 int64 `protobuf:"zigzag64,5,opt,name=sin64,proto3" json:"sin64,omitempty"` + Uint32 uint32 `protobuf:"varint,6,opt,name=uint32,proto3" json:"uint32,omitempty"` + Uint64 uint64 `protobuf:"varint,7,opt,name=uint64,proto3" json:"uint64,omitempty"` + Fixed32 uint32 `protobuf:"fixed32,8,opt,name=fixed32,proto3" json:"fixed32,omitempty"` + Fixed64 uint64 `protobuf:"fixed64,9,opt,name=fixed64,proto3" json:"fixed64,omitempty"` + Sfixed32 int32 `protobuf:"fixed32,10,opt,name=sfixed32,proto3" json:"sfixed32,omitempty"` + Bool bool `protobuf:"varint,11,opt,name=bool,proto3" json:"bool,omitempty"` + Bytes []byte `protobuf:"bytes,12,opt,name=bytes,proto3" json:"bytes,omitempty"` + Double float64 `protobuf:"fixed64,13,opt,name=double,proto3" json:"double,omitempty"` + Enum AllTheTypesNoAny_ExampleEnum `protobuf:"varint,14,opt,name=enum,proto3,enum=AllTheTypesNoAny_ExampleEnum" json:"enum,omitempty"` + Message *ExampleMessage `protobuf:"bytes,15,opt,name=message,proto3" json:"message,omitempty"` + // Types that are assignable to Oneof: + // + // *AllTheTypesNoAny_Oneofstring + // *AllTheTypesNoAny_Oneofmessage + Oneof isAllTheTypesNoAny_Oneof `protobuf_oneof:"oneof"` + SimpleMap map[int32]string `protobuf:"bytes,19,rep,name=simple_map,json=simpleMap,proto3" json:"simple_map,omitempty" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + ComplexMap map[string]*ExampleMessage `protobuf:"bytes,20,rep,name=complex_map,json=complexMap,proto3" json:"complex_map,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + SimpleList []string `protobuf:"bytes,21,rep,name=simple_list,json=simpleList,proto3" json:"simple_list,omitempty"` + ComplexList []*ExampleMessage `protobuf:"bytes,22,rep,name=complex_list,json=complexList,proto3" json:"complex_list,omitempty"` +} + +func (x *AllTheTypesNoAny) Reset() { + *x = AllTheTypesNoAny{} + if protoimpl.UnsafeEnabled { + mi := &file_messages_types_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *AllTheTypesNoAny) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*AllTheTypesNoAny) ProtoMessage() {} + +func (x *AllTheTypesNoAny) ProtoReflect() protoreflect.Message { + mi := &file_messages_types_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use AllTheTypesNoAny.ProtoReflect.Descriptor instead. 
+func (*AllTheTypesNoAny) Descriptor() ([]byte, []int) { + return file_messages_types_proto_rawDescGZIP(), []int{2} +} + +func (x *AllTheTypesNoAny) GetStr() string { + if x != nil { + return x.Str + } + return "" +} + +func (x *AllTheTypesNoAny) GetInt32() int32 { + if x != nil { + return x.Int32 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetInt64() int64 { + if x != nil { + return x.Int64 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetSint32() int32 { + if x != nil { + return x.Sint32 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetSin64() int64 { + if x != nil { + return x.Sin64 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetUint32() uint32 { + if x != nil { + return x.Uint32 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetUint64() uint64 { + if x != nil { + return x.Uint64 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetFixed32() uint32 { + if x != nil { + return x.Fixed32 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetFixed64() uint64 { + if x != nil { + return x.Fixed64 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetSfixed32() int32 { + if x != nil { + return x.Sfixed32 + } + return 0 +} + +func (x *AllTheTypesNoAny) GetBool() bool { + if x != nil { + return x.Bool + } + return false +} + +func (x *AllTheTypesNoAny) GetBytes() []byte { + if x != nil { + return x.Bytes + } + return nil +} + +func (x *AllTheTypesNoAny) GetDouble() float64 { + if x != nil { + return x.Double + } + return 0 +} + +func (x *AllTheTypesNoAny) GetEnum() AllTheTypesNoAny_ExampleEnum { + if x != nil { + return x.Enum + } + return AllTheTypesNoAny_OPTION_0 +} + +func (x *AllTheTypesNoAny) GetMessage() *ExampleMessage { + if x != nil { + return x.Message + } + return nil +} + +func (m *AllTheTypesNoAny) GetOneof() isAllTheTypesNoAny_Oneof { + if m != nil { + return m.Oneof + } + return nil +} + +func (x *AllTheTypesNoAny) GetOneofstring() string { + if x, ok := x.GetOneof().(*AllTheTypesNoAny_Oneofstring); ok { + return x.Oneofstring + } + return "" +} + +func (x *AllTheTypesNoAny) GetOneofmessage() *ExampleMessage { + if x, ok := x.GetOneof().(*AllTheTypesNoAny_Oneofmessage); ok { + return x.Oneofmessage + } + return nil +} + +func (x *AllTheTypesNoAny) GetSimpleMap() map[int32]string { + if x != nil { + return x.SimpleMap + } + return nil +} + +func (x *AllTheTypesNoAny) GetComplexMap() map[string]*ExampleMessage { + if x != nil { + return x.ComplexMap + } + return nil +} + +func (x *AllTheTypesNoAny) GetSimpleList() []string { + if x != nil { + return x.SimpleList + } + return nil +} + +func (x *AllTheTypesNoAny) GetComplexList() []*ExampleMessage { + if x != nil { + return x.ComplexList + } + return nil +} + +type isAllTheTypesNoAny_Oneof interface { + isAllTheTypesNoAny_Oneof() +} + +type AllTheTypesNoAny_Oneofstring struct { + Oneofstring string `protobuf:"bytes,16,opt,name=oneofstring,proto3,oneof"` +} + +type AllTheTypesNoAny_Oneofmessage struct { + Oneofmessage *ExampleMessage `protobuf:"bytes,17,opt,name=oneofmessage,proto3,oneof"` +} + +func (*AllTheTypesNoAny_Oneofstring) isAllTheTypesNoAny_Oneof() {} + +func (*AllTheTypesNoAny_Oneofmessage) isAllTheTypesNoAny_Oneof() {} + +type SimpleNested struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + SimpleA []*ExampleMessage `protobuf:"bytes,1,rep,name=simple_a,json=simpleA,proto3" json:"simple_a,omitempty"` + SimpleB []*ExampleMessage `protobuf:"bytes,2,rep,name=simple_b,json=simpleB,proto3" json:"simple_b,omitempty"` +} + +func (x *SimpleNested) Reset() { + *x = 
SimpleNested{} + if protoimpl.UnsafeEnabled { + mi := &file_messages_types_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SimpleNested) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SimpleNested) ProtoMessage() {} + +func (x *SimpleNested) ProtoReflect() protoreflect.Message { + mi := &file_messages_types_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SimpleNested.ProtoReflect.Descriptor instead. +func (*SimpleNested) Descriptor() ([]byte, []int) { + return file_messages_types_proto_rawDescGZIP(), []int{3} +} + +func (x *SimpleNested) GetSimpleA() []*ExampleMessage { + if x != nil { + return x.SimpleA + } + return nil +} + +func (x *SimpleNested) GetSimpleB() []*ExampleMessage { + if x != nil { + return x.SimpleB + } + return nil +} + +type ComplexNested struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + AllTheTypesNoAnyA []*AllTheTypesNoAny `protobuf:"bytes,1,rep,name=all_the_types_no_any_a,json=allTheTypesNoAnyA,proto3" json:"all_the_types_no_any_a,omitempty"` + AllTheTypesNoAnyB []*AllTheTypesNoAny `protobuf:"bytes,2,rep,name=all_the_types_no_any_b,json=allTheTypesNoAnyB,proto3" json:"all_the_types_no_any_b,omitempty"` +} + +func (x *ComplexNested) Reset() { + *x = ComplexNested{} + if protoimpl.UnsafeEnabled { + mi := &file_messages_types_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ComplexNested) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ComplexNested) ProtoMessage() {} + +func (x *ComplexNested) ProtoReflect() protoreflect.Message { + mi := &file_messages_types_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ComplexNested.ProtoReflect.Descriptor instead. 
+func (*ComplexNested) Descriptor() ([]byte, []int) { + return file_messages_types_proto_rawDescGZIP(), []int{4} +} + +func (x *ComplexNested) GetAllTheTypesNoAnyA() []*AllTheTypesNoAny { + if x != nil { + return x.AllTheTypesNoAnyA + } + return nil +} + +func (x *ComplexNested) GetAllTheTypesNoAnyB() []*AllTheTypesNoAny { + if x != nil { + return x.AllTheTypesNoAnyB + } + return nil +} + +type DeepNested struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ComplexNested *ComplexNested `protobuf:"bytes,1,opt,name=complex_nested,json=complexNested,proto3" json:"complex_nested,omitempty"` + SimpleNested *SimpleNested `protobuf:"bytes,2,opt,name=simple_nested,json=simpleNested,proto3" json:"simple_nested,omitempty"` +} + +func (x *DeepNested) Reset() { + *x = DeepNested{} + if protoimpl.UnsafeEnabled { + mi := &file_messages_types_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DeepNested) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeepNested) ProtoMessage() {} + +func (x *DeepNested) ProtoReflect() protoreflect.Message { + mi := &file_messages_types_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeepNested.ProtoReflect.Descriptor instead. +func (*DeepNested) Descriptor() ([]byte, []int) { + return file_messages_types_proto_rawDescGZIP(), []int{5} +} + +func (x *DeepNested) GetComplexNested() *ComplexNested { + if x != nil { + return x.ComplexNested + } + return nil +} + +func (x *DeepNested) GetSimpleNested() *SimpleNested { + if x != nil { + return x.SimpleNested + } + return nil +} + var File_messages_types_proto protoreflect.FileDescriptor var file_messages_types_proto_rawDesc = []byte{ @@ -439,9 +882,90 @@ var file_messages_types_proto_rawDesc = []byte{ 0x02, 0x38, 0x01, 0x22, 0x29, 0x0a, 0x0b, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x45, 0x6e, 0x75, 0x6d, 0x12, 0x0c, 0x0a, 0x08, 0x4f, 0x50, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x30, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x4f, 0x50, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x31, 0x10, 0x01, 0x42, 0x07, - 0x0a, 0x05, 0x6f, 0x6e, 0x65, 0x6f, 0x66, 0x42, 0x11, 0x5a, 0x0f, 0x2e, 0x2e, 0x2f, 0x75, 0x74, - 0x69, 0x6c, 0x5f, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x33, + 0x0a, 0x05, 0x6f, 0x6e, 0x65, 0x6f, 0x66, 0x22, 0x95, 0x07, 0x0a, 0x10, 0x41, 0x6c, 0x6c, 0x54, + 0x68, 0x65, 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, 0x6f, 0x41, 0x6e, 0x79, 0x12, 0x10, 0x0a, 0x03, + 0x73, 0x74, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x74, 0x72, 0x12, 0x14, + 0x0a, 0x05, 0x69, 0x6e, 0x74, 0x33, 0x32, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x69, + 0x6e, 0x74, 0x33, 0x32, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x03, 0x52, 0x05, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x69, + 0x6e, 0x74, 0x33, 0x32, 0x18, 0x04, 0x20, 0x01, 0x28, 0x11, 0x52, 0x06, 0x73, 0x69, 0x6e, 0x74, + 0x33, 0x32, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x69, 0x6e, 0x36, 0x34, 0x18, 0x05, 0x20, 0x01, 0x28, + 0x12, 0x52, 0x05, 0x73, 0x69, 0x6e, 0x36, 0x34, 0x12, 0x16, 0x0a, 0x06, 0x75, 0x69, 0x6e, 0x74, + 0x33, 0x32, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x75, 0x69, 0x6e, 0x74, 0x33, 0x32, + 0x12, 0x16, 0x0a, 0x06, 
0x75, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x18, 0x07, 0x20, 0x01, 0x28, 0x04, + 0x52, 0x06, 0x75, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x12, 0x18, 0x0a, 0x07, 0x66, 0x69, 0x78, 0x65, + 0x64, 0x33, 0x32, 0x18, 0x08, 0x20, 0x01, 0x28, 0x07, 0x52, 0x07, 0x66, 0x69, 0x78, 0x65, 0x64, + 0x33, 0x32, 0x12, 0x18, 0x0a, 0x07, 0x66, 0x69, 0x78, 0x65, 0x64, 0x36, 0x34, 0x18, 0x09, 0x20, + 0x01, 0x28, 0x06, 0x52, 0x07, 0x66, 0x69, 0x78, 0x65, 0x64, 0x36, 0x34, 0x12, 0x1a, 0x0a, 0x08, + 0x73, 0x66, 0x69, 0x78, 0x65, 0x64, 0x33, 0x32, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0f, 0x52, 0x08, + 0x73, 0x66, 0x69, 0x78, 0x65, 0x64, 0x33, 0x32, 0x12, 0x12, 0x0a, 0x04, 0x62, 0x6f, 0x6f, 0x6c, + 0x18, 0x0b, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, 0x62, 0x6f, 0x6f, 0x6c, 0x12, 0x14, 0x0a, 0x05, + 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x05, 0x62, 0x79, 0x74, + 0x65, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x18, 0x0d, 0x20, 0x01, + 0x28, 0x01, 0x52, 0x06, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x12, 0x31, 0x0a, 0x04, 0x65, 0x6e, + 0x75, 0x6d, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1d, 0x2e, 0x41, 0x6c, 0x6c, 0x54, 0x68, + 0x65, 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, 0x6f, 0x41, 0x6e, 0x79, 0x2e, 0x45, 0x78, 0x61, 0x6d, + 0x70, 0x6c, 0x65, 0x45, 0x6e, 0x75, 0x6d, 0x52, 0x04, 0x65, 0x6e, 0x75, 0x6d, 0x12, 0x29, 0x0a, + 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0f, + 0x2e, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x52, + 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x22, 0x0a, 0x0b, 0x6f, 0x6e, 0x65, 0x6f, + 0x66, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x18, 0x10, 0x20, 0x01, 0x28, 0x09, 0x48, 0x00, 0x52, + 0x0b, 0x6f, 0x6e, 0x65, 0x6f, 0x66, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x12, 0x35, 0x0a, 0x0c, + 0x6f, 0x6e, 0x65, 0x6f, 0x66, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x11, 0x20, 0x01, + 0x28, 0x0b, 0x32, 0x0f, 0x2e, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x4d, 0x65, 0x73, 0x73, + 0x61, 0x67, 0x65, 0x48, 0x00, 0x52, 0x0c, 0x6f, 0x6e, 0x65, 0x6f, 0x66, 0x6d, 0x65, 0x73, 0x73, + 0x61, 0x67, 0x65, 0x12, 0x3f, 0x0a, 0x0a, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x6d, 0x61, + 0x70, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x20, 0x2e, 0x41, 0x6c, 0x6c, 0x54, 0x68, 0x65, + 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, 0x6f, 0x41, 0x6e, 0x79, 0x2e, 0x53, 0x69, 0x6d, 0x70, 0x6c, + 0x65, 0x4d, 0x61, 0x70, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x09, 0x73, 0x69, 0x6d, 0x70, 0x6c, + 0x65, 0x4d, 0x61, 0x70, 0x12, 0x42, 0x0a, 0x0b, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x5f, + 0x6d, 0x61, 0x70, 0x18, 0x14, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x41, 0x6c, 0x6c, 0x54, + 0x68, 0x65, 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, 0x6f, 0x41, 0x6e, 0x79, 0x2e, 0x43, 0x6f, 0x6d, + 0x70, 0x6c, 0x65, 0x78, 0x4d, 0x61, 0x70, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0a, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x4d, 0x61, 0x70, 0x12, 0x1f, 0x0a, 0x0b, 0x73, 0x69, 0x6d, 0x70, + 0x6c, 0x65, 0x5f, 0x6c, 0x69, 0x73, 0x74, 0x18, 0x15, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0a, 0x73, + 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x4c, 0x69, 0x73, 0x74, 0x12, 0x32, 0x0a, 0x0c, 0x63, 0x6f, 0x6d, + 0x70, 0x6c, 0x65, 0x78, 0x5f, 0x6c, 0x69, 0x73, 0x74, 0x18, 0x16, 0x20, 0x03, 0x28, 0x0b, 0x32, + 0x0f, 0x2e, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x52, 0x0b, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x4c, 0x69, 0x73, 0x74, 0x1a, 0x3c, 0x0a, + 0x0e, 0x53, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x4d, 
0x61, 0x70, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, + 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x6b, 0x65, + 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x1a, 0x4e, 0x0a, 0x0f, 0x43, + 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x4d, 0x61, 0x70, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, + 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, + 0x12, 0x25, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, + 0x0f, 0x2e, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x29, 0x0a, 0x0b, 0x45, + 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x45, 0x6e, 0x75, 0x6d, 0x12, 0x0c, 0x0a, 0x08, 0x4f, 0x50, + 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x30, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x4f, 0x50, 0x54, 0x49, + 0x4f, 0x4e, 0x5f, 0x31, 0x10, 0x01, 0x42, 0x07, 0x0a, 0x05, 0x6f, 0x6e, 0x65, 0x6f, 0x66, 0x22, + 0x66, 0x0a, 0x0c, 0x53, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x12, + 0x2a, 0x0a, 0x08, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x61, 0x18, 0x01, 0x20, 0x03, 0x28, + 0x0b, 0x32, 0x0f, 0x2e, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x4d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x52, 0x07, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x41, 0x12, 0x2a, 0x0a, 0x08, 0x73, + 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x62, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0f, 0x2e, + 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x52, 0x07, + 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x42, 0x22, 0x9b, 0x01, 0x0a, 0x0d, 0x43, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x78, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x12, 0x44, 0x0a, 0x16, 0x61, 0x6c, 0x6c, + 0x5f, 0x74, 0x68, 0x65, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x73, 0x5f, 0x6e, 0x6f, 0x5f, 0x61, 0x6e, + 0x79, 0x5f, 0x61, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x11, 0x2e, 0x41, 0x6c, 0x6c, 0x54, + 0x68, 0x65, 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, 0x6f, 0x41, 0x6e, 0x79, 0x52, 0x11, 0x61, 0x6c, + 0x6c, 0x54, 0x68, 0x65, 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, 0x6f, 0x41, 0x6e, 0x79, 0x41, 0x12, + 0x44, 0x0a, 0x16, 0x61, 0x6c, 0x6c, 0x5f, 0x74, 0x68, 0x65, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x73, + 0x5f, 0x6e, 0x6f, 0x5f, 0x61, 0x6e, 0x79, 0x5f, 0x62, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, + 0x11, 0x2e, 0x41, 0x6c, 0x6c, 0x54, 0x68, 0x65, 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, 0x6f, 0x41, + 0x6e, 0x79, 0x52, 0x11, 0x61, 0x6c, 0x6c, 0x54, 0x68, 0x65, 0x54, 0x79, 0x70, 0x65, 0x73, 0x4e, + 0x6f, 0x41, 0x6e, 0x79, 0x42, 0x22, 0x77, 0x0a, 0x0a, 0x44, 0x65, 0x65, 0x70, 0x4e, 0x65, 0x73, + 0x74, 0x65, 0x64, 0x12, 0x35, 0x0a, 0x0e, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x5f, 0x6e, + 0x65, 0x73, 0x74, 0x65, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0e, 0x2e, 0x43, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x52, 0x0d, 0x63, 0x6f, 0x6d, + 0x70, 0x6c, 0x65, 0x78, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x12, 0x32, 0x0a, 0x0d, 0x73, 0x69, + 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x6e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x0b, 0x32, 0x0d, 0x2e, 0x53, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, + 0x52, 0x0c, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x42, 0x11, + 0x5a, 0x0f, 0x2e, 0x2e, 0x2f, 0x75, 0x74, 0x69, 0x6c, 0x5f, 0x6d, 0x65, 
0x73, 0x73, 0x61, 0x67, + 0x65, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -456,30 +980,50 @@ func file_messages_types_proto_rawDescGZIP() []byte { return file_messages_types_proto_rawDescData } -var file_messages_types_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_messages_types_proto_msgTypes = make([]protoimpl.MessageInfo, 4) +var file_messages_types_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_messages_types_proto_msgTypes = make([]protoimpl.MessageInfo, 10) var file_messages_types_proto_goTypes = []interface{}{ - (AllTheTypes_ExampleEnum)(0), // 0: AllTheTypes.ExampleEnum - (*ExampleMessage)(nil), // 1: ExampleMessage - (*AllTheTypes)(nil), // 2: AllTheTypes - nil, // 3: AllTheTypes.SimpleMapEntry - nil, // 4: AllTheTypes.ComplexMapEntry - (*anypb.Any)(nil), // 5: google.protobuf.Any + (AllTheTypes_ExampleEnum)(0), // 0: AllTheTypes.ExampleEnum + (AllTheTypesNoAny_ExampleEnum)(0), // 1: AllTheTypesNoAny.ExampleEnum + (*ExampleMessage)(nil), // 2: ExampleMessage + (*AllTheTypes)(nil), // 3: AllTheTypes + (*AllTheTypesNoAny)(nil), // 4: AllTheTypesNoAny + (*SimpleNested)(nil), // 5: SimpleNested + (*ComplexNested)(nil), // 6: ComplexNested + (*DeepNested)(nil), // 7: DeepNested + nil, // 8: AllTheTypes.SimpleMapEntry + nil, // 9: AllTheTypes.ComplexMapEntry + nil, // 10: AllTheTypesNoAny.SimpleMapEntry + nil, // 11: AllTheTypesNoAny.ComplexMapEntry + (*anypb.Any)(nil), // 12: google.protobuf.Any } var file_messages_types_proto_depIdxs = []int32{ - 0, // 0: AllTheTypes.enum:type_name -> AllTheTypes.ExampleEnum - 1, // 1: AllTheTypes.message:type_name -> ExampleMessage - 1, // 2: AllTheTypes.oneofmessage:type_name -> ExampleMessage - 5, // 3: AllTheTypes.any:type_name -> google.protobuf.Any - 3, // 4: AllTheTypes.simple_map:type_name -> AllTheTypes.SimpleMapEntry - 4, // 5: AllTheTypes.complex_map:type_name -> AllTheTypes.ComplexMapEntry - 1, // 6: AllTheTypes.complex_list:type_name -> ExampleMessage - 1, // 7: AllTheTypes.ComplexMapEntry.value:type_name -> ExampleMessage - 8, // [8:8] is the sub-list for method output_type - 8, // [8:8] is the sub-list for method input_type - 8, // [8:8] is the sub-list for extension type_name - 8, // [8:8] is the sub-list for extension extendee - 0, // [0:8] is the sub-list for field type_name + 0, // 0: AllTheTypes.enum:type_name -> AllTheTypes.ExampleEnum + 2, // 1: AllTheTypes.message:type_name -> ExampleMessage + 2, // 2: AllTheTypes.oneofmessage:type_name -> ExampleMessage + 12, // 3: AllTheTypes.any:type_name -> google.protobuf.Any + 8, // 4: AllTheTypes.simple_map:type_name -> AllTheTypes.SimpleMapEntry + 9, // 5: AllTheTypes.complex_map:type_name -> AllTheTypes.ComplexMapEntry + 2, // 6: AllTheTypes.complex_list:type_name -> ExampleMessage + 1, // 7: AllTheTypesNoAny.enum:type_name -> AllTheTypesNoAny.ExampleEnum + 2, // 8: AllTheTypesNoAny.message:type_name -> ExampleMessage + 2, // 9: AllTheTypesNoAny.oneofmessage:type_name -> ExampleMessage + 10, // 10: AllTheTypesNoAny.simple_map:type_name -> AllTheTypesNoAny.SimpleMapEntry + 11, // 11: AllTheTypesNoAny.complex_map:type_name -> AllTheTypesNoAny.ComplexMapEntry + 2, // 12: AllTheTypesNoAny.complex_list:type_name -> ExampleMessage + 2, // 13: SimpleNested.simple_a:type_name -> ExampleMessage + 2, // 14: SimpleNested.simple_b:type_name -> ExampleMessage + 4, // 15: ComplexNested.all_the_types_no_any_a:type_name -> AllTheTypesNoAny + 4, // 16: ComplexNested.all_the_types_no_any_b:type_name -> AllTheTypesNoAny + 6, // 17: 
DeepNested.complex_nested:type_name -> ComplexNested + 5, // 18: DeepNested.simple_nested:type_name -> SimpleNested + 2, // 19: AllTheTypes.ComplexMapEntry.value:type_name -> ExampleMessage + 2, // 20: AllTheTypesNoAny.ComplexMapEntry.value:type_name -> ExampleMessage + 21, // [21:21] is the sub-list for method output_type + 21, // [21:21] is the sub-list for method input_type + 21, // [21:21] is the sub-list for extension type_name + 21, // [21:21] is the sub-list for extension extendee + 0, // [0:21] is the sub-list for field type_name } func init() { file_messages_types_proto_init() } @@ -512,18 +1056,70 @@ func file_messages_types_proto_init() { return nil } } + file_messages_types_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*AllTheTypesNoAny); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_messages_types_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SimpleNested); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_messages_types_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ComplexNested); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_messages_types_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*DeepNested); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } } file_messages_types_proto_msgTypes[1].OneofWrappers = []interface{}{ (*AllTheTypes_Oneofstring)(nil), (*AllTheTypes_Oneofmessage)(nil), } + file_messages_types_proto_msgTypes[2].OneofWrappers = []interface{}{ + (*AllTheTypesNoAny_Oneofstring)(nil), + (*AllTheTypesNoAny_Oneofmessage)(nil), + } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_messages_types_proto_rawDesc, - NumEnums: 1, - NumMessages: 4, + NumEnums: 2, + NumMessages: 10, NumExtensions: 0, NumServices: 0, }, From a380d695a6672f6981d1fe36cd1acc8d68ee9c3e Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 20 Aug 2024 01:27:45 +0800 Subject: [PATCH 036/157] GH-43733: [C++] Fix Scalar boolean handling in row encoder (#43734) ### Rationale for this change See https://github.com/apache/arrow/issues/43733 ### What changes are included in this PR? Separate Null and Valid handling when BooleanKeyEncoder::Encode meets a Null This patch also does a migration: * row_encoder.cc -> row_encoder_internal.cc * move row_encoder_internal{.cc|.h} from `compute/kernel` to `compute/row` ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #43733 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/acero/asof_join_node_test.cc | 2 +- cpp/src/arrow/acero/hash_join.cc | 2 +- cpp/src/arrow/acero/hash_join_benchmark.cc | 2 +- cpp/src/arrow/acero/hash_join_dict.h | 2 +- cpp/src/arrow/acero/hash_join_node_test.cc | 2 +- cpp/src/arrow/acero/swiss_join.cc | 2 +- cpp/src/arrow/acero/swiss_join_internal.h | 2 +- cpp/src/arrow/acero/tpch_node_test.cc | 2 +- cpp/src/arrow/compute/CMakeLists.txt | 1 + .../arrow/compute/kernels/hash_aggregate.cc | 2 +- cpp/src/arrow/compute/row/grouper.cc | 2 +- .../row_encoder_internal.cc} | 41 ++++++----- .../{kernels => row}/row_encoder_internal.h | 14 ++-- .../compute/row/row_encoder_internal_test.cc | 68 +++++++++++++++++++ cpp/src/arrow/compute/row/row_test.cc | 2 +- 16 files changed, 111 insertions(+), 37 deletions(-) rename cpp/src/arrow/compute/{kernels/row_encoder.cc => row/row_encoder_internal.cc} (93%) rename cpp/src/arrow/compute/{kernels => row}/row_encoder_internal.h (96%) create mode 100644 cpp/src/arrow/compute/row/row_encoder_internal_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 67d2c19f98a..fb785e1e957 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -723,7 +723,6 @@ set(ARROW_COMPUTE_SRCS compute/ordering.cc compute/registry.cc compute/kernels/codegen_internal.cc - compute/kernels/row_encoder.cc compute/kernels/ree_util_internal.cc compute/kernels/scalar_cast_boolean.cc compute/kernels/scalar_cast_dictionary.cc @@ -742,6 +741,7 @@ set(ARROW_COMPUTE_SRCS compute/row/encode_internal.cc compute/row/compare_internal.cc compute/row/grouper.cc + compute/row/row_encoder_internal.cc compute/row/row_internal.cc compute/util.cc compute/util_internal.cc) diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index 051e280a4c5..555f580028f 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -41,8 +41,8 @@ #include "arrow/acero/util.h" #include "arrow/api.h" #include "arrow/compute/api_scalar.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/acero/hash_join.cc b/cpp/src/arrow/acero/hash_join.cc index 5aa70a23f7c..ddcd2a09957 100644 --- a/cpp/src/arrow/acero/hash_join.cc +++ b/cpp/src/arrow/acero/hash_join.cc @@ -27,8 +27,8 @@ #include "arrow/acero/hash_join_dict.h" #include "arrow/acero/task_util.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/row/encode_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/util/tracing_internal.h" namespace arrow { diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index 1f8e02e9f0f..470960b1c50 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -23,7 +23,7 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/api.h" -#include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/random.h" #include "arrow/util/thread_pool.h" diff --git a/cpp/src/arrow/acero/hash_join_dict.h b/cpp/src/arrow/acero/hash_join_dict.h 
index c7d8d785d07..02454a71462 100644 --- a/cpp/src/arrow/acero/hash_join_dict.h +++ b/cpp/src/arrow/acero/hash_join_dict.h @@ -22,7 +22,7 @@ #include "arrow/acero/schema_util.h" #include "arrow/compute/exec.h" -#include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 88f9a9e71b7..9065e286a22 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -26,9 +26,9 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/api.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 40a4b5886e4..4d0c8187ac6 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -24,10 +24,10 @@ #include "arrow/acero/swiss_join_internal.h" #include "arrow/acero/util.h" #include "arrow/array/util.h" // MakeArrayFromScalar -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/key_hash_internal.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/compute/row/encode_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/tracing_internal.h" diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index dceb74abe4f..4d749c1c529 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -22,10 +22,10 @@ #include "arrow/acero/partition_util.h" #include "arrow/acero/schema_util.h" #include "arrow/acero/task_util.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/key_map_internal.h" #include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/encode_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" namespace arrow { diff --git a/cpp/src/arrow/acero/tpch_node_test.cc b/cpp/src/arrow/acero/tpch_node_test.cc index 076bcf634a6..17fb43452bc 100644 --- a/cpp/src/arrow/acero/tpch_node_test.cc +++ b/cpp/src/arrow/acero/tpch_node_test.cc @@ -27,8 +27,8 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/tpch_node.h" #include "arrow/acero/util.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index e20b45897db..aa2a2d4e9af 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -92,6 +92,7 @@ add_arrow_test(internals_test key_hash_test.cc row/compare_test.cc row/grouper_test.cc + row/row_encoder_internal_test.cc row/row_test.cc util_internal_test.cc) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 54cd695421a..4bf6a6106df 
100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -33,9 +33,9 @@ #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/aggregate_var_std_internal.h" #include "arrow/compute/kernels/common_internal.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/util_internal.h" #include "arrow/compute/row/grouper.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/record_batch.h" #include "arrow/stl_allocator.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 45b9ad5971e..5889f94d96c 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -25,12 +25,12 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/function.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/key_hash_internal.h" #include "arrow/compute/light_array_internal.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/compute/row/grouper_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bitmap_ops.h" diff --git a/cpp/src/arrow/compute/kernels/row_encoder.cc b/cpp/src/arrow/compute/row/row_encoder_internal.cc similarity index 93% rename from cpp/src/arrow/compute/kernels/row_encoder.cc rename to cpp/src/arrow/compute/row/row_encoder_internal.cc index 8224eaa6d63..414cc6793a5 100644 --- a/cpp/src/arrow/compute/kernels/row_encoder.cc +++ b/cpp/src/arrow/compute/row/row_encoder_internal.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/util/bitmap_writer.h" #include "arrow/util/logging.h" @@ -75,26 +75,31 @@ void BooleanKeyEncoder::AddLengthNull(int32_t* length) { Status BooleanKeyEncoder::Encode(const ExecValue& data, int64_t batch_length, uint8_t** encoded_bytes) { + auto handle_next_valid_value = [&encoded_bytes](bool value) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + *encoded_ptr++ = value; + }; + auto handle_next_null_value = [&encoded_bytes]() { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + *encoded_ptr++ = 0; + }; + if (data.is_array()) { - VisitArraySpanInline( - data.array, - [&](bool value) { - auto& encoded_ptr = *encoded_bytes++; - *encoded_ptr++ = kValidByte; - *encoded_ptr++ = value; - }, - [&] { - auto& encoded_ptr = *encoded_bytes++; - *encoded_ptr++ = kNullByte; - *encoded_ptr++ = 0; - }); + VisitArraySpanInline(data.array, handle_next_valid_value, + handle_next_null_value); } else { const auto& scalar = data.scalar_as(); - bool value = scalar.is_valid && scalar.value; - for (int64_t i = 0; i < batch_length; i++) { - auto& encoded_ptr = *encoded_bytes++; - *encoded_ptr++ = kValidByte; - *encoded_ptr++ = value; + if (!scalar.is_valid) { + for (int64_t i = 0; i < batch_length; i++) { + handle_next_null_value(); + } + } else { + const bool value = scalar.value; + for (int64_t i = 0; i < batch_length; i++) { + handle_next_valid_value(value); + } } } return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/row_encoder_internal.h b/cpp/src/arrow/compute/row/row_encoder_internal.h similarity index 96% rename from cpp/src/arrow/compute/kernels/row_encoder_internal.h rename to cpp/src/arrow/compute/row/row_encoder_internal.h index 9bf7c1d1c4f..60eb14af504 100644 --- a/cpp/src/arrow/compute/kernels/row_encoder_internal.h +++ b/cpp/src/arrow/compute/row/row_encoder_internal.h @@ -29,7 +29,7 @@ using internal::checked_cast; namespace compute { namespace internal { -struct KeyEncoder { +struct ARROW_EXPORT KeyEncoder { // the first byte of an encoded key is used to indicate nullity static constexpr bool kExtraByteForNull = true; @@ -60,7 +60,7 @@ struct KeyEncoder { } }; -struct BooleanKeyEncoder : KeyEncoder { +struct ARROW_EXPORT BooleanKeyEncoder : KeyEncoder { static constexpr int kByteWidth = 1; void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override; @@ -76,7 +76,7 @@ struct BooleanKeyEncoder : KeyEncoder { MemoryPool* pool) override; }; -struct FixedWidthKeyEncoder : KeyEncoder { +struct ARROW_EXPORT FixedWidthKeyEncoder : KeyEncoder { explicit FixedWidthKeyEncoder(std::shared_ptr type) : type_(std::move(type)), byte_width_(checked_cast(*type_).bit_width() / 8) {} @@ -97,7 +97,7 @@ struct FixedWidthKeyEncoder : KeyEncoder { int byte_width_; }; -struct DictionaryKeyEncoder : FixedWidthKeyEncoder { +struct ARROW_EXPORT DictionaryKeyEncoder : FixedWidthKeyEncoder { DictionaryKeyEncoder(std::shared_ptr type, MemoryPool* pool) : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {} @@ -112,7 +112,7 @@ struct DictionaryKeyEncoder : FixedWidthKeyEncoder { }; template -struct VarLengthKeyEncoder : KeyEncoder { +struct ARROW_EXPORT VarLengthKeyEncoder : KeyEncoder { using Offset = typename T::offset_type; void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override { @@ -232,7 +232,7 @@ struct VarLengthKeyEncoder : KeyEncoder { std::shared_ptr type_; }; -struct NullKeyEncoder 
: KeyEncoder { +struct ARROW_EXPORT NullKeyEncoder : KeyEncoder { void AddLength(const ExecValue&, int64_t batch_length, int32_t* lengths) override {} void AddLengthNull(int32_t* length) override {} @@ -274,7 +274,7 @@ class ARROW_EXPORT RowEncoder { } private: - ExecContext* ctx_; + ExecContext* ctx_{nullptr}; std::vector> encoders_; std::vector offsets_; std::vector bytes_; diff --git a/cpp/src/arrow/compute/row/row_encoder_internal_test.cc b/cpp/src/arrow/compute/row/row_encoder_internal_test.cc new file mode 100644 index 00000000000..78839d1ead5 --- /dev/null +++ b/cpp/src/arrow/compute/row/row_encoder_internal_test.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/compute/row/row_encoder_internal.h" + +#include "arrow/array/validate.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" + +namespace arrow::compute::internal { + +// GH-43733: Test that the key encoder can handle boolean scalar values well. +TEST(TestKeyEncoder, BooleanScalar) { + for (auto scalar : {BooleanScalar{}, BooleanScalar{true}, BooleanScalar{false}}) { + BooleanKeyEncoder key_encoder; + SCOPED_TRACE("scalar " + scalar.ToString()); + constexpr int64_t kBatchLength = 10; + std::array lengths{}; + key_encoder.AddLength(ExecValue{&scalar}, kBatchLength, lengths.data()); + // Check that the lengths are all 2. + constexpr int32_t kPayloadWidth = + BooleanKeyEncoder::kByteWidth + BooleanKeyEncoder::kExtraByteForNull; + for (int i = 0; i < kBatchLength; ++i) { + ASSERT_EQ(kPayloadWidth, lengths[i]); + } + std::array, kBatchLength> payloads{}; + std::array payload_ptrs{}; + // Reset the payload pointers to point to the beginning of each payload. + // This is necessary because the key encoder may have modified the pointers. 
+ auto reset_payload_ptrs = [&payload_ptrs, &payloads]() { + std::transform(payloads.begin(), payloads.end(), payload_ptrs.begin(), + [](auto& payload) -> uint8_t* { return payload.data(); }); + }; + reset_payload_ptrs(); + ASSERT_OK(key_encoder.Encode(ExecValue{&scalar}, kBatchLength, payload_ptrs.data())); + reset_payload_ptrs(); + ASSERT_OK_AND_ASSIGN(auto array_data, + key_encoder.Decode(payload_ptrs.data(), kBatchLength, + ::arrow::default_memory_pool())); + ASSERT_EQ(kBatchLength, array_data->length); + auto boolean_array = std::make_shared(array_data); + ASSERT_OK(arrow::internal::ValidateArrayFull(*array_data)); + ASSERT_OK_AND_ASSIGN( + auto expected_array, + MakeArrayFromScalar(scalar, kBatchLength, ::arrow::default_memory_pool())); + AssertArraysEqual(*expected_array, *boolean_array); + } +} + +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 6aed9e43278..5057ce91b5b 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -155,7 +155,7 @@ TEST(RowTableLarge, LARGE_MEMORY_TEST(Encode)) { auto value, ::arrow::gen::Constant( std::make_shared(std::string(length_per_binary, 'X'))) ->Generate(1)); - values.push_back(std::move(value)); + values.emplace_back(std::move(value)); ExecBatch batch = ExecBatch(std::move(values), 1); ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns)); From 9d4dcc903e84732a6e14d61aece1f9a1d096f7c9 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 19 Aug 2024 14:01:57 -0500 Subject: [PATCH 037/157] GH-38847: [Documentation][C++] Explicitly note that compute is optional (#43629) ### Rationale for this change A user didn't know from reading just the compute documentation that compute is an optional feature. We can make that explicit ### What changes are included in this PR? Added a cross-reference to the optional features section ### Are these changes tested? No ### Are there any user-facing changes? No * GitHub Issue: #38847 Authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- docs/source/cpp/tutorials/compute_tutorial.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/cpp/tutorials/compute_tutorial.rst b/docs/source/cpp/tutorials/compute_tutorial.rst index a650865d75c..72ebc35650d 100644 --- a/docs/source/cpp/tutorials/compute_tutorial.rst +++ b/docs/source/cpp/tutorials/compute_tutorial.rst @@ -39,7 +39,9 @@ Pre-requisites Before continuing, make sure you have: -1. An Arrow installation, which you can set up here: :doc:`/cpp/build_system` +1. An Arrow installation, which you can set up here: :doc:`/cpp/build_system`. + If you're compiling Arrow yourself, be sure you compile with the compute module + enabled (i.e., ``-DARROW_COMPUTE=ON``), see :ref:`cpp_build_optional_components`. 2. An understanding of basic Arrow data structures from :doc:`/cpp/tutorials/basic_arrow` @@ -50,7 +52,7 @@ Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. -2. ``A main()`` is needed to glue things together. +2. A ``main()`` is needed to glue things together. 3. We need data to play with. From 364e01441a1d437c4e833fc00ec76af3a6f342d7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:48:06 +0900 Subject: [PATCH 038/157] MINOR: [Java] Bump org.apache.avro:avro from 1.11.3 to 1.12.0 in /java (#43564) Bumps org.apache.avro:avro from 1.11.3 to 1.12.0. 
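To make the BooleanKeyEncoder change for GH-43733 above concrete, the following is a minimal standalone sketch (not the Arrow code path) of the two-bytes-per-value layout that `BooleanKeyEncoder::Encode` writes: one validity byte followed by one value byte, with a null boolean scalar now broadcast as a null marker plus a padding byte per row instead of being collapsed into a valid `false`. The byte constants here are illustrative placeholders; the real `kValidByte`/`kNullByte` values are defined on `KeyEncoder`.

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Illustrative placeholders; the actual constants are defined on KeyEncoder.
constexpr uint8_t kValidByte = 0xFF;
constexpr uint8_t kNullByte = 0x00;

// Encode each (possibly null) boolean as [validity byte, value byte].
std::vector<uint8_t> EncodeBooleans(const std::vector<std::optional<bool>>& values) {
  std::vector<uint8_t> encoded;
  encoded.reserve(values.size() * 2);
  for (const auto& v : values) {
    if (v.has_value()) {
      encoded.push_back(kValidByte);
      encoded.push_back(*v ? 1 : 0);
    } else {
      // The fix: nulls get the null marker plus a padding byte, so a null
      // scalar broadcast over a batch no longer decodes as a valid "false".
      encoded.push_back(kNullByte);
      encoded.push_back(0);
    }
  }
  return encoded;
}

int main() {
  for (uint8_t byte : EncodeBooleans({true, std::nullopt, false})) {
    std::cout << static_cast<int>(byte) << ' ';
  }
  std::cout << '\n';  // 255 1 0 0 255 0 with the placeholder constants above
  return 0;
}
```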
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.avro:avro&package-manager=maven&previous-version=1.11.3&new-version=1.12.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/dataset/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index 74071a6c305..f3384fabbed 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -33,7 +33,7 @@ under the License. ../../../cpp/release-build/ 1.14.1 - 1.11.3 + 1.12.0 diff --git a/java/pom.xml b/java/pom.xml index 0466cad9237..45e9f07174b 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -102,7 +102,7 @@ under the License. 2.17.2 3.4.0 24.3.25 - 1.11.3 + 1.12.0 2 10.17.0 From b5726ea59e9e92dd99c687faf07e4c797b02ce7b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:51:27 +0900 Subject: [PATCH 039/157] MINOR: [Java] Bump org.apache.commons:commons-compress from 1.26.2 to 1.27.0 in /java (#43653) Bumps org.apache.commons:commons-compress from 1.26.2 to 1.27.0. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.commons:commons-compress&package-manager=maven&previous-version=1.26.2&new-version=1.27.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/compression/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/compression/pom.xml b/java/compression/pom.xml index 8774f7cabde..a1f2bc861da 100644 --- a/java/compression/pom.xml +++ b/java/compression/pom.xml @@ -50,7 +50,7 @@ under the License. org.apache.commons commons-compress - 1.26.2 + 1.27.0 com.github.luben From 944d13660c952c05deb58a9a74f562947ef7ec16 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:18:10 +0900 Subject: [PATCH 040/157] MINOR: [Java] Bump error_prone_core.version from 2.29.2 to 2.30.0 in /java (#43656) Bumps `error_prone_core.version` from 2.29.2 to 2.30.0. Updates `com.google.errorprone:error_prone_annotations` from 2.29.2 to 2.30.0
Release notes

Sourced from com.google.errorprone:error_prone_annotations's releases.

Error Prone 2.30.0

New checks:

Closed issues: #632, #4487

Full changelog: https://github.com/google/error-prone/compare/v2.29.2...v2.30.0

Commits
  • 5ada179 Release Error Prone 2.30.0
  • af175b0 Don't fire the CanIgnoreReturnValueSuggester for `dagger.producers.Producti...
  • ba8f9a2 Do not update getters that override methods from a superclass.
  • a706e8d Add ability to suppress warning for the entire AutoValue class
  • 86df5cf Convert some simple blocks to return switches using yield
  • 474554a Remove // fall out comments, which are sometimes used to document an empty ...
  • ac7ebf5 Handle var in MustBeClosedChecker
  • ccd3ca6 Add handling of toBuilder()
  • d887307 Omit some unnecessary break statements when translating to -> switches
  • fe07236 Add Error Prone check for unnecessary boxed types in AutoValue classes.
  • Additional commits viewable in compare view

Updates `com.google.errorprone:error_prone_core` from 2.29.2 to 2.30.0

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 45e9f07174b..1524dc32579 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -107,7 +107,7 @@ under the License. 2 10.17.0 true - 2.29.2 + 2.30.0 5.11.0 5.2.0 3.46.0 From 906934e4af9f2ec8a402cc87d15a11783fc99950 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:19:07 +0900 Subject: [PATCH 041/157] MINOR: [Java] Bump com.h2database:h2 from 2.3.230 to 2.3.232 in /java (#43654) Bumps [com.h2database:h2](https://github.com/h2database/h2database) from 2.3.230 to 2.3.232.
Release notes

Sourced from com.h2database:h2's releases.

Version 2.3.232

Commits
  • 2e46a1c Merge remote-tracking branch 'h2database/master'
  • 5badbf9 in preparation for release
  • c0696ef Merge pull request #4113 from katzyn/uuid
  • 8f8e88c Don't cast to long and back
  • e0895be Fix building of documentation
  • 19d4428 Add optional version parameter to RANDOM_UUID function
  • bd9ac2f Merge pull request #4103 from katzyn/map_columns
  • 64f2fbe Pass mapped columns to table filters of subqueries
  • 74ed2b5 Merge pull request #4094 from andreitokar/issue_4075
  • 9d533f1 Merge pull request #4098 from katzyn/fixes
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=com.h2database:h2&package-manager=maven&previous-version=2.3.230&new-version=2.3.232)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/adapter/jdbc/pom.xml | 2 +- java/performance/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/adapter/jdbc/pom.xml b/java/adapter/jdbc/pom.xml index 124cc535c25..099798a95cd 100644 --- a/java/adapter/jdbc/pom.xml +++ b/java/adapter/jdbc/pom.xml @@ -59,7 +59,7 @@ under the License. com.h2database h2 - 2.3.230 + 2.3.232 test diff --git a/java/performance/pom.xml b/java/performance/pom.xml index f6d3a26b4f3..9f4df1ff2e7 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -75,7 +75,7 @@ under the License. com.h2database h2 - 2.3.230 + 2.3.232 runtime From bd3953f01b2b443a2021027e9beb5e302f74f42d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Aug 2024 20:20:59 -0700 Subject: [PATCH 042/157] MINOR: [C#] Bump Google.Protobuf from 3.27.0 to 3.27.3 in /csharp (#43754) Bumps Google.Protobuf from 3.27.0 to 3.27.3. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Google.Protobuf&package-manager=nuget&previous-version=3.27.0&new-version=3.27.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Flight.TestWeb.csproj | 1 + 1 file changed, 1 insertion(+) diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj index e6c7e174fa3..14227e2c4eb 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj @@ -5,6 +5,7 @@ + From 70a0189f30cbfe9484681f0d407aed5ca3f4467b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Aug 2024 20:23:45 -0700 Subject: [PATCH 043/157] MINOR: [C#] Bump System.Memory from 4.5.4 to 4.5.5 in /csharp (#43755) Bumps System.Memory from 4.5.4 to 4.5.5. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=System.Memory&package-manager=nuget&previous-version=4.5.4&new-version=4.5.5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 1 + 1 file changed, 1 insertion(+) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index a46f0d91935..9e1866f8416 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -8,6 +8,7 @@ +
From b0317f2b2b62b3be9beb8d834aa51b776fb0179e Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 20 Aug 2024 17:04:33 +0900 Subject: [PATCH 044/157] GH-43707: [Python] Fix compilation on Cython<3 (#43765) ### Rationale for this change Fix compilation on Cython < 3 ### What changes are included in this PR? Add an explicit cast ### Are these changes tested? N/A ### Are there any user-facing changes? No * GitHub Issue: #43707 Authored-by: David Li Signed-off-by: Joris Van den Bossche --- python/pyarrow/types.pxi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 93d68fb8478..dcd2b61c334 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5328,8 +5328,9 @@ def opaque(DataType storage_type, str type_name not None, str vendor_name not No cdef: c_string c_type_name = tobytes(type_name) c_string c_vendor_name = tobytes(vendor_name) - shared_ptr[CDataType] c_type = make_shared[COpaqueType]( + shared_ptr[COpaqueType] c_opaque_type = make_shared[COpaqueType]( storage_type.sp_type, c_type_name, c_vendor_name) + shared_ptr[CDataType] c_type = static_pointer_cast[CDataType, COpaqueType](c_opaque_type) OpaqueType out = OpaqueType.__new__(OpaqueType) out.init(c_type) return out From cc3c868aea7317a58447658f1c165ad352cd4865 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:57:57 +0200 Subject: [PATCH 045/157] MINOR: [Documentation] Add installation of ninja-build to Python Development docs (#43600) ### Rationale for this change Otherwise, you get a CMake error: ``` CMake Error: CMake was unable to find a build program corresponding to "Ninja". CMAKE_MAKE_PROGRAM is not set. You probably need to select a different build tool. ``` Authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Signed-off-by: Joris Van den Bossche --- docs/source/developers/python.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst index 2f3e892ce8e..6beea55e66b 100644 --- a/docs/source/developers/python.rst +++ b/docs/source/developers/python.rst @@ -267,7 +267,7 @@ On Debian/Ubuntu, you need the following minimal set of dependencies: .. code-block:: - $ sudo apt-get install build-essential cmake python3-dev + $ sudo apt-get install build-essential ninja-build cmake python3-dev Now, let's create a Python virtual environment with all Python dependencies in the same folder as the repositories, and a target installation folder: From 525881987d0b9b4f464c3e3593a9a7b4e3c767d0 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Tue, 20 Aug 2024 20:25:19 -0400 Subject: [PATCH 046/157] GH-17682: [C++][Python] Bool8 Extension Type Implementation (#43488) ### Rationale for this change C++ and Python implementations of #43234 ### What changes are included in this PR? - Implement C++ `Bool8Type`, `Bool8Array`, `Bool8Scalar`, and tests - Implement Python bindings to C++, as well as zero-copy numpy conversion methods - TODO: docs waiting for rebase on #43458 ### Are these changes tested? Yes ### Are there any user-facing changes? 
Bool8 extension type will be available in C++ and Python libraries * GitHub Issue: #17682 Authored-by: Joel Lubinitsky Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/CMakeLists.txt | 6 + cpp/src/arrow/extension/bool8.cc | 61 ++++++++ cpp/src/arrow/extension/bool8.h | 58 ++++++++ cpp/src/arrow/extension/bool8_test.cc | 91 ++++++++++++ cpp/src/arrow/extension_type.cc | 7 +- python/pyarrow/__init__.py | 7 +- python/pyarrow/array.pxi | 114 ++++++++++++++- python/pyarrow/includes/libarrow.pxd | 9 ++ python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 23 ++- python/pyarrow/tests/test_extension_type.py | 152 ++++++++++++++++++++ python/pyarrow/tests/test_misc.py | 3 + python/pyarrow/types.pxi | 74 ++++++++++ 15 files changed, 604 insertions(+), 7 deletions(-) create mode 100644 cpp/src/arrow/extension/bool8.cc create mode 100644 cpp/src/arrow/extension/bool8.h create mode 100644 cpp/src/arrow/extension/bool8_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb785e1e957..fb7253b6fd6 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -906,6 +906,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON + extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 6741ab602f5..fcd5fa529ab 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. +add_arrow_test(test + SOURCES + bool8_test.cc + PREFIX + "arrow-extension-bool8") + add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc new file mode 100644 index 00000000000..c081f0c2b28 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/extension/bool8.h" +#include "arrow/util/logging.h" + +namespace arrow::extension { + +bool Bool8Type::ExtensionEquals(const ExtensionType& other) const { + return extension_name() == other.extension_name(); +} + +std::string Bool8Type::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + +std::string Bool8Type::Serialize() const { return ""; } + +Result> Bool8Type::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + if (storage_type->id() != Type::INT8) { + return Status::Invalid("Expected INT8 storage type, got ", storage_type->ToString()); + } + if (serialized_data != "") { + return Status::Invalid("Serialize data must be empty, got ", serialized_data); + } + return bool8(); +} + +std::shared_ptr Bool8Type::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.bool8", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> Bool8Type::Make() { + return std::make_shared(); +} + +std::shared_ptr bool8() { return std::make_shared(); } + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h new file mode 100644 index 00000000000..02e629b28a8 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Array : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Type : public ExtensionType { + public: + /// \brief Construct a Bool8Type. + Bool8Type() : ExtensionType(int8()) {} + + std::string extension_name() const override { return "arrow.bool8"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// Create a Bool8Array from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + static Result> Make(); +}; + +/// \brief Return a Bool8Type instance. 
+ARROW_EXPORT std::shared_ptr bool8(); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc new file mode 100644 index 00000000000..eabcfcf62d3 --- /dev/null +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/bool8.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/testing/extension_type.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +TEST(Bool8Type, Basics) { + auto type = internal::checked_pointer_cast(extension::bool8()); + auto type2 = internal::checked_pointer_cast(extension::bool8()); + ASSERT_EQ("arrow.bool8", type->extension_name()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + ASSERT_EQ(*type, *type2); + ASSERT_EQ(*arrow::int8(), *type->storage_type()); + ASSERT_EQ("", type->Serialize()); + ASSERT_EQ("extension", type->ToString(false)); +} + +TEST(Bool8Type, CreateFromArray) { + auto type = internal::checked_pointer_cast(extension::bool8()); + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(5, array->length()); + ASSERT_EQ(1, array->null_count()); +} + +TEST(Bool8Type, Deserialize) { + auto type = internal::checked_pointer_cast(extension::bool8()); + ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), "")); + ASSERT_EQ(*type, *deserialized); + ASSERT_NOT_OK(type->Deserialize(type->storage_type(), "must be empty")); + ASSERT_EQ(*type, *deserialized); + ASSERT_NOT_OK(type->Deserialize(uint8(), "")); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, MetadataRoundTrip) { + auto type = internal::checked_pointer_cast(extension::bool8()); + std::string serialized = type->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, BatchRoundTrip) { + auto type = internal::checked_pointer_cast(extension::bool8()); + + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = + RecordBatch::Make(schema({field("field", type)}), array->length(), {array}); + + std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + 
ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), *written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index cf8dda7a85d..685018f7de7 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -28,6 +28,7 @@ #include "arrow/chunked_array.h" #include "arrow/config.h" #ifdef ARROW_JSON +#include "arrow/extension/bool8.h" #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -146,10 +147,12 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON // Register canonical extension types - auto ext_type = + auto fst_ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); + ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type)); - ARROW_CHECK_OK(g_registry->RegisterType(ext_type)); + auto bool8_ext_type = checked_pointer_cast(extension::bool8()); + ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type)); #endif } diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index aa7bab9f97e..807bcdc3150 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -174,6 +174,7 @@ def print_entry(label, value): run_end_encoded, fixed_shape_tensor, opaque, + bool8, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,7 +185,7 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, FixedShapeTensorType, OpaqueType, - PyExtensionType, UnknownExtensionType, + Bool8Type, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,7 +219,7 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - scalar, NA, _NULL as NULL, Scalar, + Bool8Array, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,7 +236,7 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar) + FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6c40a21db96..4c3eb932326 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1581,7 +1581,7 @@ cdef class Array(_PandasConvertible): def to_numpy(self, zero_copy_only=True, writable=False): """ - Return a NumPy view or copy of this array (experimental). + Return a NumPy view or copy of this array. By default, tries to return a view of this array. This is only supported for primitive arrays with the same memory layout as NumPy @@ -4476,6 +4476,118 @@ cdef class OpaqueArray(ExtensionArray): """ +cdef class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. 
+ + Examples + -------- + Define the extension type for an bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only=True, writable=False): + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + if not writable: + try: + return self.storage.to_numpy().view(np.bool_) + except ArrowInvalid as e: + if zero_copy_only: + raise e + + return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) + + @staticmethod + def from_storage(Int8Array storage): + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + return ExtensionArray.from_storage(bool8(), storage) + + @staticmethod + def from_numpy(obj): + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. 
+ + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + + if obj.ndim != 1: + raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 array") + + if obj.dtype not in [np.bool_, np.int8]: + raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage") + + storage_arr = array(obj.view(np.int8), type=int8()) + return Bool8Array.from_storage(storage_arr) + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9b008d150f1..a54a1db292f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2895,6 +2895,15 @@ cdef extern from "arrow/extension/opaque.h" namespace "arrow::extension" nogil: pass +cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: + cdef cppclass CBool8Type" arrow::extension::Bool8Type"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray): + pass + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 2cb302d20a8..e3625c18152 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -214,6 +214,9 @@ cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type +cdef class Bool8Type(BaseExtensionType): + cdef: + const CBool8Type* bool8_ext_type cdef class OpaqueType(BaseExtensionType): cdef: diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 2f9fc1c5542..19a26bd6c68 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -126,6 +126,8 @@ cdef api object pyarrow_wrap_data_type( out = FixedShapeTensorType.__new__(FixedShapeTensorType) elif ext_type.extension_name() == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) + elif ext_type.extension_name() == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 12a99c2aece..72ae2aee5f8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1091,6 +1091,18 @@ cdef class OpaqueScalar(ExtensionScalar): """ +cdef class Bool8Scalar(ExtensionScalar): + """ + Concrete class for bool8 extension scalar. + """ + + def as_py(self): + """ + Return this scalar as a Python object. 
+ """ + py_val = super().as_py() + return None if py_val is None else py_val != 0 + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, @@ -1199,6 +1211,11 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): type = ensure_type(type, allow_none=True) pool = maybe_unbox_memory_pool(memory_pool) + extension_type = None + if type is not None and type.id == _Type_EXTENSION: + extension_type = type + type = type.storage_type + if _is_array_like(value): value = get_values(value, &is_pandas_object) @@ -1223,4 +1240,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): # retrieve the scalar from the first position scalar = GetResultValue(array.get().GetScalar(0)) - return Scalar.wrap(scalar) + result = Scalar.wrap(scalar) + + if extension_type is not None: + result = ExtensionScalar.from_storage(extension_type, result) + return result diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 58c54189f22..b04ee85ec99 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1707,3 +1707,155 @@ def test_opaque_type(pickle_module, storage_type, storage): # cast extension type -> storage type inner = arr.cast(storage_type) assert inner == storage + + +def test_bool8_type(pickle_module): + bool8_type = pa.bool8() + storage_type = pa.int8() + assert bool8_type.extension_name == "arrow.bool8" + assert bool8_type.storage_type == storage_type + assert str(bool8_type) == "extension" + + assert bool8_type == bool8_type + assert bool8_type == pa.bool8() + assert bool8_type != storage_type + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(bool8_type)) + assert result == bool8_type + + # IPC roundtrip + storage = pa.array([-1, 0, 1, 2, None], storage_type) + arr = pa.ExtensionArray.from_storage(bool8_type, storage) + assert isinstance(arr, pa.Bool8Array) + + # extension is registered by default + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.bool8" + assert isinstance(batch.column(0), pa.Bool8Array) + + # cast storage -> extension type + result = storage.cast(bool8_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(storage_type) + assert inner == storage + + +def test_bool8_to_bool_conversion(): + bool_arr = pa.array([True, False, True, True, None], pa.bool_()) + bool8_arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + + # cast extension type -> arrow boolean type + assert bool8_arr.cast(pa.bool_()) == bool_arr + + # cast arrow boolean type -> extension type, expecting canonical values + canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8()) + canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(), canonical_storage) + assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr + + +def test_bool8_to_numpy_conversion(): + arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + + # cannot zero-copy with nulls + with pytest.raises( + pa.ArrowInvalid, + match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True", + ): + arr.to_numpy() + + # nullable conversion possible with a copy, but dest dtype is object + assert np.array_equal( + arr.to_numpy(zero_copy_only=False), + np.array([True, False, True, True, None], dtype=np.object_), + ) + 
+ # zero-copy possible with non-null array + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2], pa.int8()), + ) + + arr_to_np = arr_no_nulls.to_numpy() + assert np.array_equal(arr_to_np, np_arr_no_nulls) + + # same underlying buffer + assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address + + # if the user requests a writable array, a copy should be performed + arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True) + assert np.array_equal(arr_to_np_writable, np_arr_no_nulls) + + # different underlying buffer + assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address + + +def test_bool8_from_numpy_conversion(): + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([1, 0, 1, 1], pa.int8()), + ) + + arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls) + assert arr_from_np == canonical_bool8_arr_no_nulls + + # same underlying buffer + assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data + + # conversion only valid for 1-D arrays + with pytest.raises( + ValueError, + match="Cannot convert 2-D array to bool8 array", + ): + pa.Bool8Array.from_numpy( + np.array([[True, False], [False, True]], dtype=np.bool_), + ) + + with pytest.raises( + ValueError, + match="Cannot convert 0-D array to bool8 array", + ): + pa.Bool8Array.from_numpy(np.bool_()) + + # must use compatible storage type + with pytest.raises( + TypeError, + match="Array dtype float64 incompatible with bool8 storage", + ): + pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64)) + + +def test_bool8_scalar(): + assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() is False + assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None + + arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + assert arr[0].as_py() is True + assert arr[1].as_py() is False + assert arr[2].as_py() is True + assert arr[3].as_py() is True + assert arr[4].as_py() is None + + assert pa.scalar(-1, type=pa.bool8()).as_py() is True + assert pa.scalar(0, type=pa.bool8()).as_py() is False + assert pa.scalar(1, type=pa.bool8()).as_py() is True + assert pa.scalar(2, type=pa.bool8()).as_py() is True + assert pa.scalar(None, type=pa.bool8()).as_py() is None diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 9a55a38177f..5d3471c7c35 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -250,6 +250,9 @@ def test_set_timezone_db_path_non_windows(): pa.OpaqueArray, pa.OpaqueScalar, pa.OpaqueType, + pa.Bool8Array, + pa.Bool8Scalar, + pa.Bool8Type, ]) def test_extension_type_constructor_errors(klass): # ARROW-2638: prevent calling extension class constructors directly diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index dcd2b61c334..563782f0c26 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1837,6 +1837,37 @@ cdef class FixedShapeTensorType(BaseExtensionType): return FixedShapeTensorScalar +cdef class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension type. 
+ + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.bool8_ext_type = type.get() + + def __arrow_ext_class__(self): + return Bool8Array + + def __reduce__(self): + return bool8, () + + def __arrow_ext_scalar_class__(self): + return Bool8Scalar + + cdef class OpaqueType(BaseExtensionType): """ Concrete class for opaque extension type. @@ -5278,6 +5309,49 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out +def bool8(): + """ + Create instance of bool8 extension type. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(int8) + + Create a table with a bool8 array: + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[-1,0,1,2,null]] + + Returns + ------- + type : Bool8Type + """ + + cdef Bool8Type out = Bool8Type.__new__(Bool8Type) + + c_type = GetResultValue(CBool8Type.Make()) + + out.init(c_type) + + return out + + def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ Create instance of opaque extension type. From 27c22389579dd773d9701f5d3c743bbfca3bdb8e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:38:12 +0900 Subject: [PATCH 047/157] MINOR: [Java] Bump org.codehaus.mojo:exec-maven-plugin from 3.3.0 to 3.4.1 in /java (#43692) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.codehaus.mojo:exec-maven-plugin](https://github.com/mojohaus/exec-maven-plugin) from 3.3.0 to 3.4.1.
Release notes

Sourced from org.codehaus.mojo:exec-maven-plugin's releases.

3.4.1

🐛 Bug Fixes

📦 Dependency updates

👻 Maintenance

🔧 Build

3.4.0

🚀 New features and improvements

  • Allow <includePluginDependencies> to be specified for the exec:exec goal (#432) @sebthom

🐛 Bug Fixes

📦 Dependency updates

👻 Maintenance

🔧 Build

Commits
  • 7b0be2c [maven-release-plugin] prepare release 3.4.1
  • 5ac4f80 Environment variable Path should be used as case-insensitive
  • cfb3a9f Use Maven4 enabled with GH Action
  • d0ded48 Use shared release drafter GH Action
  • 4c22954 Bump org.codehaus.mojo:mojo-parent from 84 to 85
  • a8c4f94 fix: NPE because declared MavenSession field hides field of superclass
  • a2b735f Remove redundant spotless configuration
  • 8e0e83c [maven-release-plugin] prepare for next development iteration
  • 6c4996f [maven-release-plugin] prepare release 3.4.0
  • c7ad671 Remove Log4j 1.2.x from ITs
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.codehaus.mojo:exec-maven-plugin&package-manager=maven&previous-version=3.3.0&new-version=3.4.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1524dc32579..0f3e5760f2b 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -504,7 +504,7 @@ under the License. org.codehaus.mojo exec-maven-plugin - 3.3.0 + 3.4.1 org.codehaus.mojo From 4af1e491df7ac22217656668b65c3e8d55f5b5ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:56:44 +0900 Subject: [PATCH 048/157] MINOR: [Java] Bump io.grpc:grpc-bom from 1.65.0 to 1.66.0 in /java (#43657) Bumps [io.grpc:grpc-bom](https://github.com/grpc/grpc-java) from 1.65.0 to 1.66.0.
Release notes

Sourced from io.grpc:grpc-bom's releases.

v1.65.1

What's Changed

  • netty: Restore old behavior of NettyAdaptiveCumulator, but avoid using that class if Netty is on version 4.1.111 or later
Commits
  • cf78406 Bump version to 1.66.0
  • 33af0a7 Update README etc to reference 1.66.0
  • 19c9b99 xds: XdsClient should unsubscribe on last resource (#11264)
  • 752a045 Revert "Start 1.67.0 development cycle (#11416)" (#11428)
  • ef09d94 Revert "Introduce onResult2 in NameResolver Listener2 that returns Status (#1...
  • c37fb18 Start 1.67.0 development cycle
  • 9ba2f9d Introduce onResult2 in NameResolver Listener2 that returns Status (#11313)
  • 786523d xds: WRR rr_fallback should trigger with one endpoint weight
  • b108ed3 api: Give instruments a toString() including their name
  • eb4cdf7 Update MAINTAINERS.md (#11241)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.grpc:grpc-bom&package-manager=maven&previous-version=1.65.0&new-version=1.66.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 0f3e5760f2b..a73453df68f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -97,7 +97,7 @@ under the License. 2.0.13 33.2.1-jre 4.1.112.Final - 1.65.0 + 1.66.0 3.25.4 2.17.2 3.4.0 From 9fc03015463a8f1cb616b088342b104fbc767a0c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 09:22:53 +0200 Subject: [PATCH 049/157] GH-43069: [Python] Use Py_IsFinalizing from pythoncapi_compat.h (#43767) ### Rationale for this change https://github.com/apache/arrow/pull/43540 already vendored `pythoncapi_compat.h`, so closing https://github.com/apache/arrow/issues/43069 by using this as well for `Py_IsFinalizing` (which was added in https://github.com/apache/arrow/pull/42034, and for which we opened that follow-up issue to use `pythoncapi_compat.h` instead) Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/udf.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/pyarrow/src/arrow/python/udf.cc b/python/pyarrow/src/arrow/python/udf.cc index 2c1e97c3ea0..74f16899c47 100644 --- a/python/pyarrow/src/arrow/python/udf.cc +++ b/python/pyarrow/src/arrow/python/udf.cc @@ -24,14 +24,11 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/row/grouper.h" #include "arrow/python/common.h" +#include "arrow/python/vendored/pythoncapi_compat.h" #include "arrow/table.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" -// Py_IsFinalizing added in Python 3.13.0a4 -#if PY_VERSION_HEX < 0x030D00A4 -#define Py_IsFinalizing() _Py_IsFinalizing() -#endif namespace arrow { using compute::ExecSpan; using compute::Grouper; From e1e7c501019ac26c896d61fa0c129eee83da9b55 Mon Sep 17 00:00:00 2001 From: Oliver Layer Date: Wed, 21 Aug 2024 13:22:57 +0200 Subject: [PATCH 050/157] GH-40036: [C++] Azure file system write buffering & async writes (#43096) ### Rationale for this change See #40036. ### What changes are included in this PR? Write buffering and async writes (similar to what the S3 file system does) in the `ObjectAppendStream` for the Azure file system. With write buffering and async writes, the input scenario creation runtime in the tests (which uses the `ObjectAppendStream` against Azurite) decreased from ~25s (see [here](https://github.com/apache/arrow/issues/40036)) to ~800ms: ``` [ RUN ] TestAzuriteFileSystem.OpenInputFileMixedReadVsReadAt [ OK ] TestAzuriteFileSystem.OpenInputFileMixedReadVsReadAt (787 ms) ``` ### Are these changes tested? Added some tests with background writes enabled and disabled (some were taken from the S3 tests). Everything changed should be covered. ### Are there any user-facing changes? `AzureOptions` now allows for `background_writes` to be set (default: true). No breaking changes. ### Notes - The code in `DoWrite` is very similar to [the code in the S3 FS](https://github.com/apache/arrow/blob/edfa343eeca008513f0300924380e1b187cc976b/cpp/src/arrow/filesystem/s3fs.cc#L1753). Maybe this could be unified? I didn't see this in the scope of the PR though. 
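For illustration only, a minimal sketch of how a caller might opt out of the new background writes through `AzureOptions` (assuming the `AzureFileSystem::Make` and `OpenOutputStream` APIs exercised by the tests in this PR; the credential setup placeholder and the `container/file.txt` path are not part of this change):

```
#include "arrow/filesystem/azurefs.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status WriteSynchronously() {
  arrow::fs::AzureOptions options;
  // ... account/credential configuration would go here ...
  // Stage blocks inline in Write() instead of submitting them to the I/O thread pool.
  options.background_writes = false;
  ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::AzureFileSystem::Make(options));
  ARROW_ASSIGN_OR_RAISE(auto out, fs->OpenOutputStream("container/file.txt"));
  ARROW_RETURN_NOT_OK(out->Write("hello azure"));
  return out->Close();
}
```

The same switch is available as a URI query parameter (`...?background_writes=false`), as covered by `TestFromUriDisableBackgroundWrites` in the test changes below.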
* GitHub Issue: #40036 Lead-authored-by: Oliver Layer Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/filesystem/azurefs.cc | 276 ++++++++++++++++++++--- cpp/src/arrow/filesystem/azurefs.h | 3 + cpp/src/arrow/filesystem/azurefs_test.cc | 264 ++++++++++++++++++---- 3 files changed, 471 insertions(+), 72 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 9b3c0c0c1d7..0bad8563397 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -22,6 +22,7 @@ #include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs_internal.h" +#include "arrow/io/memory.h" // idenfity.hpp triggers -Wattributes warnings cause -Werror builds to fail, // so disable it for this file with pragmas. @@ -144,6 +145,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { blob_storage_scheme = "http"; dfs_storage_scheme = "http"; } + } else if (kv.first == "background_writes") { + ARROW_ASSIGN_OR_RAISE(background_writes, + ::arrow::internal::ParseBoolean(kv.second)); } else { return Status::Invalid( "Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'"); @@ -937,8 +941,8 @@ Status CommitBlockList(std::shared_ptr block_bl const std::vector& block_ids, const Blobs::CommitBlockListOptions& options) { try { - // CommitBlockList puts all block_ids in the latest element. That means in the case of - // overlapping block_ids the newly staged block ids will always replace the + // CommitBlockList puts all block_ids in the latest element. That means in the case + // of overlapping block_ids the newly staged block ids will always replace the // previously committed blocks. // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body block_blob_client->CommitBlockList(block_ids, options); @@ -950,7 +954,34 @@ Status CommitBlockList(std::shared_ptr block_bl return Status::OK(); } +Status StageBlock(Blobs::BlockBlobClient* block_blob_client, const std::string& id, + Core::IO::MemoryBodyStream& content) { + try { + block_blob_client->StageBlock(id, content); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + exception, "StageBlock failed for '", block_blob_client->GetUrl(), + "' new_block_id: '", id, + "'. Staging new blocks is fundamental to streaming writes to blob storage."); + } + + return Status::OK(); +} + +/// Writes will be buffered up to this size (in bytes) before actually uploading them. +static constexpr int64_t kBlockUploadSizeBytes = 10 * 1024 * 1024; +/// The maximum size of a block in Azure Blob (as per docs). +static constexpr int64_t kMaxBlockSizeBytes = 4UL * 1024 * 1024 * 1024; + +/// This output stream, similar to other arrow OutputStreams, is not thread-safe. 
class ObjectAppendStream final : public io::OutputStream { + private: + struct UploadState; + + std::shared_ptr Self() { + return std::dynamic_pointer_cast(shared_from_this()); + } + public: ObjectAppendStream(std::shared_ptr block_blob_client, const io::IOContext& io_context, const AzureLocation& location, @@ -958,7 +989,8 @@ class ObjectAppendStream final : public io::OutputStream { const AzureOptions& options) : block_blob_client_(std::move(block_blob_client)), io_context_(io_context), - location_(location) { + location_(location), + background_writes_(options.background_writes) { if (metadata && metadata->size() != 0) { ArrowMetadataToCommitBlockListOptions(metadata, commit_block_list_options_); } else if (options.default_metadata && options.default_metadata->size() != 0) { @@ -1008,10 +1040,13 @@ class ObjectAppendStream final : public io::OutputStream { content_length_ = 0; } } + + upload_state_ = std::make_shared(); + if (content_length_ > 0) { ARROW_ASSIGN_OR_RAISE(auto block_list, GetBlockList(block_blob_client_)); for (auto block : block_list.CommittedBlocks) { - block_ids_.push_back(block.Name); + upload_state_->block_ids.push_back(block.Name); } } initialised_ = true; @@ -1031,12 +1066,34 @@ class ObjectAppendStream final : public io::OutputStream { if (closed_) { return Status::OK(); } + + if (current_block_) { + // Upload remaining buffer + RETURN_NOT_OK(AppendCurrentBlock()); + } + RETURN_NOT_OK(Flush()); block_blob_client_ = nullptr; closed_ = true; return Status::OK(); } + Future<> CloseAsync() override { + if (closed_) { + return Status::OK(); + } + + if (current_block_) { + // Upload remaining buffer + RETURN_NOT_OK(AppendCurrentBlock()); + } + + return FlushAsync().Then([self = Self()]() { + self->block_blob_client_ = nullptr; + self->closed_ = true; + }); + } + bool closed() const override { return closed_; } Status CheckClosed(const char* action) const { @@ -1052,11 +1109,11 @@ class ObjectAppendStream final : public io::OutputStream { } Status Write(const std::shared_ptr& buffer) override { - return DoAppend(buffer->data(), buffer->size(), buffer); + return DoWrite(buffer->data(), buffer->size(), buffer); } Status Write(const void* data, int64_t nbytes) override { - return DoAppend(data, nbytes); + return DoWrite(data, nbytes); } Status Flush() override { @@ -1066,20 +1123,111 @@ class ObjectAppendStream final : public io::OutputStream { // flush. This also avoids some unhandled errors when flushing in the destructor. return Status::OK(); } - return CommitBlockList(block_blob_client_, block_ids_, commit_block_list_options_); + + Future<> pending_blocks_completed; + { + std::unique_lock lock(upload_state_->mutex); + pending_blocks_completed = upload_state_->pending_blocks_completed; + } + + RETURN_NOT_OK(pending_blocks_completed.status()); + std::unique_lock lock(upload_state_->mutex); + return CommitBlockList(block_blob_client_, upload_state_->block_ids, + commit_block_list_options_); } - private: - Status DoAppend(const void* data, int64_t nbytes, - std::shared_ptr owned_buffer = nullptr) { - RETURN_NOT_OK(CheckClosed("append")); - auto append_data = reinterpret_cast(data); - Core::IO::MemoryBodyStream block_content(append_data, nbytes); - if (block_content.Length() == 0) { + Future<> FlushAsync() { + RETURN_NOT_OK(CheckClosed("flush async")); + if (!initialised_) { + // If the stream has not been successfully initialized then there is nothing to + // flush. This also avoids some unhandled errors when flushing in the destructor. 
return Status::OK(); } - const auto n_block_ids = block_ids_.size(); + Future<> pending_blocks_completed; + { + std::unique_lock lock(upload_state_->mutex); + pending_blocks_completed = upload_state_->pending_blocks_completed; + } + + return pending_blocks_completed.Then([self = Self()] { + std::unique_lock lock(self->upload_state_->mutex); + return CommitBlockList(self->block_blob_client_, self->upload_state_->block_ids, + self->commit_block_list_options_); + }); + } + + private: + Status AppendCurrentBlock() { + ARROW_ASSIGN_OR_RAISE(auto buf, current_block_->Finish()); + current_block_.reset(); + current_block_size_ = 0; + return AppendBlock(buf); + } + + Status DoWrite(const void* data, int64_t nbytes, + std::shared_ptr owned_buffer = nullptr) { + if (closed_) { + return Status::Invalid("Operation on closed stream"); + } + + const auto* data_ptr = reinterpret_cast(data); + auto advance_ptr = [this, &data_ptr, &nbytes](const int64_t offset) { + data_ptr += offset; + nbytes -= offset; + pos_ += offset; + content_length_ += offset; + }; + + // Handle case where we have some bytes buffered from prior calls. + if (current_block_size_ > 0) { + // Try to fill current buffer + const int64_t to_copy = + std::min(nbytes, kBlockUploadSizeBytes - current_block_size_); + RETURN_NOT_OK(current_block_->Write(data_ptr, to_copy)); + current_block_size_ += to_copy; + advance_ptr(to_copy); + + // If buffer isn't full, break + if (current_block_size_ < kBlockUploadSizeBytes) { + return Status::OK(); + } + + // Upload current buffer + RETURN_NOT_OK(AppendCurrentBlock()); + } + + // We can upload chunks without copying them into a buffer + while (nbytes >= kBlockUploadSizeBytes) { + const auto upload_size = std::min(nbytes, kMaxBlockSizeBytes); + RETURN_NOT_OK(AppendBlock(data_ptr, upload_size)); + advance_ptr(upload_size); + } + + // Buffer remaining bytes + if (nbytes > 0) { + current_block_size_ = nbytes; + + if (current_block_ == nullptr) { + ARROW_ASSIGN_OR_RAISE( + current_block_, + io::BufferOutputStream::Create(kBlockUploadSizeBytes, io_context_.pool())); + } else { + // Re-use the allocation from before. + RETURN_NOT_OK(current_block_->Reset(kBlockUploadSizeBytes, io_context_.pool())); + } + + RETURN_NOT_OK(current_block_->Write(data_ptr, current_block_size_)); + pos_ += current_block_size_; + content_length_ += current_block_size_; + } + + return Status::OK(); + } + + std::string CreateBlock() { + std::unique_lock lock(upload_state_->mutex); + const auto n_block_ids = upload_state_->block_ids.size(); // New block ID must always be distinct from the existing block IDs. Otherwise we // will accidentally replace the content of existing blocks, causing corruption. @@ -1093,36 +1241,106 @@ class ObjectAppendStream final : public io::OutputStream { new_block_id.insert(0, required_padding_digits, '0'); // There is a small risk when appending to a blob created by another client that // `new_block_id` may overlapping with an existing block id. Adding the `-arrow` - // suffix significantly reduces the risk, but does not 100% eliminate it. For example - // if the blob was previously created with one block, with id `00001-arrow` then the - // next block we append will conflict with that, and cause corruption. + // suffix significantly reduces the risk, but does not 100% eliminate it. For + // example if the blob was previously created with one block, with id `00001-arrow` + // then the next block we append will conflict with that, and cause corruption. 
new_block_id += "-arrow"; new_block_id = Core::Convert::Base64Encode( std::vector(new_block_id.begin(), new_block_id.end())); - try { - block_blob_client_->StageBlock(new_block_id, block_content); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "StageBlock failed for '", block_blob_client_->GetUrl(), - "' new_block_id: '", new_block_id, - "'. Staging new blocks is fundamental to streaming writes to blob storage."); + upload_state_->block_ids.push_back(new_block_id); + + // We only use the future if we have background writes enabled. Without background + // writes the future is initialized as finished and not mutated any more. + if (background_writes_ && upload_state_->blocks_in_progress++ == 0) { + upload_state_->pending_blocks_completed = Future<>::Make(); } - block_ids_.push_back(new_block_id); - pos_ += nbytes; - content_length_ += nbytes; + + return new_block_id; + } + + Status AppendBlock(const void* data, int64_t nbytes, + std::shared_ptr owned_buffer = nullptr) { + RETURN_NOT_OK(CheckClosed("append")); + + if (nbytes == 0) { + return Status::OK(); + } + + const auto block_id = CreateBlock(); + + if (background_writes_) { + if (owned_buffer == nullptr) { + ARROW_ASSIGN_OR_RAISE(owned_buffer, AllocateBuffer(nbytes, io_context_.pool())); + memcpy(owned_buffer->mutable_data(), data, nbytes); + } else { + DCHECK_EQ(data, owned_buffer->data()); + DCHECK_EQ(nbytes, owned_buffer->size()); + } + + // The closure keeps the buffer and the upload state alive + auto deferred = [owned_buffer, block_id, block_blob_client = block_blob_client_, + state = upload_state_]() mutable -> Status { + Core::IO::MemoryBodyStream block_content(owned_buffer->data(), + owned_buffer->size()); + + auto status = StageBlock(block_blob_client.get(), block_id, block_content); + HandleUploadOutcome(state, status); + return Status::OK(); + }; + RETURN_NOT_OK(io::internal::SubmitIO(io_context_, std::move(deferred))); + } else { + auto append_data = reinterpret_cast(data); + Core::IO::MemoryBodyStream block_content(append_data, nbytes); + + RETURN_NOT_OK(StageBlock(block_blob_client_.get(), block_id, block_content)); + } + return Status::OK(); } + Status AppendBlock(std::shared_ptr buffer) { + return AppendBlock(buffer->data(), buffer->size(), buffer); + } + + static void HandleUploadOutcome(const std::shared_ptr& state, + const Status& status) { + std::unique_lock lock(state->mutex); + if (!status.ok()) { + state->status &= status; + } + // Notify completion + if (--state->blocks_in_progress == 0) { + auto fut = state->pending_blocks_completed; + lock.unlock(); + fut.MarkFinished(state->status); + } + } + std::shared_ptr block_blob_client_; const io::IOContext io_context_; const AzureLocation location_; + const bool background_writes_; int64_t content_length_ = kNoSize; + std::shared_ptr current_block_; + int64_t current_block_size_ = 0; + bool closed_ = false; bool initialised_ = false; int64_t pos_ = 0; - std::vector block_ids_; + + // This struct is kept alive through background writes to avoid problems + // in the completion handler. 
+ struct UploadState { + std::mutex mutex; + std::vector block_ids; + int64_t blocks_in_progress = 0; + Status status; + Future<> pending_blocks_completed = Future<>::MakeFinished(Status::OK()); + }; + std::shared_ptr upload_state_; + Blobs::CommitBlockListOptions commit_block_list_options_; }; diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 072b061eeb2..ebbe00c4ee7 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -112,6 +112,9 @@ struct ARROW_EXPORT AzureOptions { /// This will be ignored if non-empty metadata is passed to OpenOutputStream. std::shared_ptr default_metadata; + /// Whether OutputStream writes will be issued in the background, without blocking. + bool background_writes = true; + private: enum class CredentialKind { kDefault, diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 5ff241b17ff..9d437d1f83a 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include "arrow/util/future.h" #include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" @@ -566,6 +568,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, default_options.dfs_storage_scheme); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kDefault); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriDfsStorage() { @@ -582,6 +585,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, default_options.dfs_storage_scheme); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kDefault); ASSERT_EQ(path, "file_system/dir/file"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriAbfs() { @@ -597,6 +601,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, "https"); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kStorageSharedKey); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriAbfss() { @@ -612,6 +617,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, "https"); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kStorageSharedKey); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriEnableTls() { @@ -628,6 +634,17 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, "http"); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kStorageSharedKey); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); + } + + void TestFromUriDisableBackgroundWrites() { + std::string path; + ASSERT_OK_AND_ASSIGN(auto options, + AzureOptions::FromUri( + "abfs://account:password@127.0.0.1:10000/container/dir/blob?" 
+ "background_writes=false", + &path)); + ASSERT_EQ(options.background_writes, false); } void TestFromUriCredentialDefault() { @@ -773,6 +790,9 @@ TEST_F(TestAzureOptions, FromUriDfsStorage) { TestFromUriDfsStorage(); } TEST_F(TestAzureOptions, FromUriAbfs) { TestFromUriAbfs(); } TEST_F(TestAzureOptions, FromUriAbfss) { TestFromUriAbfss(); } TEST_F(TestAzureOptions, FromUriEnableTls) { TestFromUriEnableTls(); } +TEST_F(TestAzureOptions, FromUriDisableBackgroundWrites) { + TestFromUriDisableBackgroundWrites(); +} TEST_F(TestAzureOptions, FromUriCredentialDefault) { TestFromUriCredentialDefault(); } TEST_F(TestAzureOptions, FromUriCredentialAnonymous) { TestFromUriCredentialAnonymous(); } TEST_F(TestAzureOptions, FromUriCredentialStorageSharedKey) { @@ -929,8 +949,9 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); - const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); - ASSERT_OK(output->Write(all_lines)); + for (auto const& line : lines) { + ASSERT_OK(output->Write(line.data(), line.size())); + } ASSERT_OK(output->Close()); } @@ -1474,6 +1495,162 @@ class TestAzureFileSystem : public ::testing::Test { arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); } + void AssertObjectContents(AzureFileSystem* fs, std::string_view path, + std::string_view expected) { + ASSERT_OK_AND_ASSIGN(auto input, fs->OpenInputStream(std::string{path})); + std::string contents; + std::shared_ptr buffer; + do { + ASSERT_OK_AND_ASSIGN(buffer, input->Read(128 * 1024)); + contents.append(buffer->ToString()); + } while (buffer->size() != 0); + + EXPECT_EQ(expected, contents); + } + + void TestOpenOutputStreamSmall() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); + ASSERT_OK_AND_ASSIGN(auto output, fs->OpenOutputStream(path, {})); + const std::string_view expected(PreexistingData::kLoremIpsum); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + + // Verify we can read the object back. 
+ AssertObjectContents(fs.get(), path, expected); + } + + void TestOpenOutputStreamLarge() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); + ASSERT_OK_AND_ASSIGN(auto output, fs->OpenOutputStream(path, {})); + + // Upload 5 MB, 4 MB und 2 MB and a very small write to test varying sizes + std::vector sizes{5 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, + 2000}; + + std::vector buffers{}; + char current_char = 'A'; + for (const auto size : sizes) { + buffers.emplace_back(size, current_char++); + } + + auto expected_size = std::int64_t{0}; + for (size_t i = 0; i < buffers.size(); ++i) { + ASSERT_OK(output->Write(buffers[i])); + expected_size += sizes[i]; + ASSERT_EQ(expected_size, output->Tell()); + } + ASSERT_OK(output->Close()); + + AssertObjectContents(fs.get(), path, + buffers[0] + buffers[1] + buffers[2] + buffers[3]); + } + + void TestOpenOutputStreamLargeSingleWrite() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); + ASSERT_OK_AND_ASSIGN(auto output, fs->OpenOutputStream(path, {})); + + constexpr std::int64_t size{12 * 1024 * 1024}; + const std::string large_string(size, 'X'); + + ASSERT_OK(output->Write(large_string)); + ASSERT_EQ(size, output->Tell()); + ASSERT_OK(output->Close()); + + AssertObjectContents(fs.get(), path, large_string); + } + + void TestOpenOutputStreamCloseAsync() { +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) + // This false positive leak is similar to the one pinpointed in the + // have_false_positive_memory_leak_with_generator() comments above, + // though the stack trace is different. It happens when a block list + // is committed from a background thread. + // + // clang-format off + // Direct leak of 968 byte(s) in 1 object(s) allocated from: + // #0 calloc + // #1 (/lib/x86_64-linux-gnu/libxml2.so.2+0xe25a4) + // #2 __xmlDefaultBufferSize + // #3 xmlBufferCreate + // #4 Azure::Storage::_internal::XmlWriter::XmlWriter() + // #5 Azure::Storage::Blobs::_detail::BlockBlobClient::CommitBlockList + // #6 Azure::Storage::Blobs::BlockBlobClient::CommitBlockList + // #7 arrow::fs::(anonymous namespace)::CommitBlockList + // #8 arrow::fs::(anonymous namespace)::ObjectAppendStream::FlushAsync()::'lambda' + // clang-format on + // + // TODO perhaps remove this skip once we can rely on + // https://github.com/Azure/azure-sdk-for-cpp/pull/5767 + // + // Also note that ClickHouse has a workaround for a similar issue: + // https://github.com/ClickHouse/ClickHouse/pull/45796 + if (options_.background_writes) { + GTEST_SKIP() << "False positive memory leak in libxml2 with CloseAsync"; + } +#endif + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + auto data = SetUpPreexistingData(); + const std::string path = data.ContainerPath("test-write-object"); + constexpr auto payload = PreexistingData::kLoremIpsum; + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); + ASSERT_OK(stream->Write(payload)); + auto close_fut = stream->CloseAsync(); + + ASSERT_OK(close_fut.MoveResult()); + + AssertObjectContents(fs.get(), path, payload); + } + + void TestOpenOutputStreamCloseAsyncDestructor() { +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) + // See above. 
+ if (options_.background_writes) { + GTEST_SKIP() << "False positive memory leak in libxml2 with CloseAsync"; + } +#endif + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + auto data = SetUpPreexistingData(); + const std::string path = data.ContainerPath("test-write-object"); + constexpr auto payload = PreexistingData::kLoremIpsum; + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); + ASSERT_OK(stream->Write(payload)); + // Destructor implicitly closes stream and completes the upload. + // Testing it doesn't matter whether flush is triggered asynchronously + // after CloseAsync or synchronously after stream.reset() since we're just + // checking that the future keeps the stream alive until completion + // rather than segfaulting on a dangling stream. + auto close_fut = stream->CloseAsync(); + stream.reset(); + ASSERT_OK(close_fut.MoveResult()); + + AssertObjectContents(fs.get(), path, payload); + } + + void TestOpenOutputStreamDestructor() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + constexpr auto* payload = "new data"; + auto data = SetUpPreexistingData(); + const std::string path = data.ContainerPath("test-write-object"); + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); + ASSERT_OK(stream->Write(payload)); + // Destructor implicitly closes stream and completes the multipart upload. + stream.reset(); + + AssertObjectContents(fs.get(), path, payload); + } + private: using StringMatcher = ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher>; @@ -2704,53 +2881,27 @@ TEST_F(TestAzuriteFileSystem, WriteMetadataHttpHeaders) { ASSERT_EQ("text/plain", content_type); } -TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { - auto data = SetUpPreexistingData(); - const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); - const std::string_view expected(PreexistingData::kLoremIpsum); - ASSERT_OK(output->Write(expected)); - ASSERT_OK(output->Close()); - - // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmallNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamSmall(); +} - std::array inbuf{}; - ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { TestOpenOutputStreamSmall(); } - EXPECT_EQ(expected, std::string_view(inbuf.data(), size)); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLargeNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamLarge(); } -TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { - auto data = SetUpPreexistingData(); - const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); - std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; - std::array buffers{ - std::string(sizes[0], 'A'), - std::string(sizes[1], 'B'), - std::string(sizes[2], 'C'), - }; - auto expected = std::int64_t{0}; - for (auto i = 0; i != 3; ++i) { - ASSERT_OK(output->Write(buffers[i])); - expected += sizes[i]; - ASSERT_EQ(expected, output->Tell()); - } - ASSERT_OK(output->Close()); - - // Verify we can read the object back. 
- ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { TestOpenOutputStreamLarge(); } - std::string contents; - std::shared_ptr buffer; - do { - ASSERT_OK_AND_ASSIGN(buffer, input->Read(128 * 1024)); - ASSERT_TRUE(buffer); - contents.append(buffer->ToString()); - } while (buffer->size() != 0); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLargeSingleWriteNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamLargeSingleWrite(); +} - EXPECT_EQ(contents, buffers[0] + buffers[1] + buffers[2]); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLargeSingleWrite) { + TestOpenOutputStreamLargeSingleWrite(); } TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { @@ -2820,6 +2971,33 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { ASSERT_RAISES(Invalid, output->Tell()); } +TEST_F(TestAzuriteFileSystem, OpenOutputStreamCloseAsync) { + TestOpenOutputStreamCloseAsync(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamCloseAsyncNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamCloseAsync(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamAsyncDestructor) { + TestOpenOutputStreamCloseAsyncDestructor(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamAsyncDestructorNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamCloseAsyncDestructor(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamDestructor) { + TestOpenOutputStreamDestructor(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamDestructorNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamDestructor(); +} + TEST_F(TestAzuriteFileSystem, OpenOutputStreamUri) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-uri.txt"); From ffee537d88ab6d26614e2a1e85d4d18152695020 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 14:18:45 +0200 Subject: [PATCH 051/157] GH-42222: [Python] Add bindings for CopyTo on RecordBatch and Array classes (#42223) ### Rationale for this change We have added bindings for the Device and MemoryManager classes (https://github.com/apache/arrow/issues/41126), and as a next step we can expose the functionality to copy a full Array or RecordBatch to a specific memory manager. ### What changes are included in this PR? This adds a `copy_to` method on pyarrow Array and RecordBatch. ### Are these changes tested? Yes * GitHub Issue: #42222 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 36 ++++++++++++ python/pyarrow/device.pxi | 6 ++ python/pyarrow/includes/libarrow.pxd | 4 ++ python/pyarrow/lib.pxd | 4 ++ python/pyarrow/table.pxi | 35 ++++++++++++ python/pyarrow/tests/test_cuda.py | 82 +++++++++++----------------- python/pyarrow/tests/test_device.py | 26 +++++++++ 7 files changed, 143 insertions(+), 50 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 4c3eb932326..77d6c9c06d2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1702,6 +1702,42 @@ cdef class Array(_PandasConvertible): _append_array_buffers(self.sp_array.get().data().get(), res) return res + def copy_to(self, destination): + """ + Construct a copy of the array with all buffers on destination + device. + + This method recursively copies the array's buffers and those of its + children onto the destination MemoryManager device and returns the + new Array. 
+ + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + Array + """ + cdef: + shared_ptr[CArray] c_array + shared_ptr[CMemoryManager] c_memory_manager + + if isinstance(destination, Device): + c_memory_manager = (destination).unwrap().get().default_memory_manager() + elif isinstance(destination, MemoryManager): + c_memory_manager = (destination).unwrap() + else: + raise TypeError( + "Argument 'destination' has incorrect type (expected a " + f"pyarrow Device or MemoryManager, got {type(destination)})" + ) + + with nogil: + c_array = GetResultValue(self.ap.CopyTo(c_memory_manager)) + return pyarrow_wrap_array(c_array) + def _export_to_c(self, out_ptr, out_schema_ptr=0): """ Export to a C ArrowArray struct, given its pointer. diff --git a/python/pyarrow/device.pxi b/python/pyarrow/device.pxi index 6e603475208..26256de6209 100644 --- a/python/pyarrow/device.pxi +++ b/python/pyarrow/device.pxi @@ -64,6 +64,9 @@ cdef class Device(_Weakrefable): self.init(device) return self + cdef inline shared_ptr[CDevice] unwrap(self) nogil: + return self.device + def __eq__(self, other): if not isinstance(other, Device): return False @@ -130,6 +133,9 @@ cdef class MemoryManager(_Weakrefable): self.init(mm) return self + cdef inline shared_ptr[CMemoryManager] unwrap(self) nogil: + return self.memory_manager + def __repr__(self): return "".format( frombytes(self.memory_manager.get().device().get().ToString()) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a54a1db292f..6f510cfc0c0 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -234,7 +234,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStatus Validate() const CStatus ValidateFull() const CResult[shared_ptr[CArray]] View(const shared_ptr[CDataType]& type) + CDeviceAllocationType device_type() + CResult[shared_ptr[CArray]] CopyTo(const shared_ptr[CMemoryManager]& to) const shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data) CResult[shared_ptr[CArray]] MakeArrayOfNull( @@ -1027,6 +1029,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CRecordBatch] Slice(int64_t offset) shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length) + CResult[shared_ptr[CRecordBatch]] CopyTo(const shared_ptr[CMemoryManager]& to) const + CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, c_bool row_major, CMemoryPool* pool) const diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index e3625c18152..a7c3b496a00 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -542,6 +542,8 @@ cdef class Device(_Weakrefable): @staticmethod cdef wrap(const shared_ptr[CDevice]& device) + cdef inline shared_ptr[CDevice] unwrap(self) nogil + cdef class MemoryManager(_Weakrefable): cdef: @@ -552,6 +554,8 @@ cdef class MemoryManager(_Weakrefable): @staticmethod cdef wrap(const shared_ptr[CMemoryManager]& mm) + cdef inline shared_ptr[CMemoryManager] unwrap(self) nogil + cdef class Buffer(_Weakrefable): cdef: diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 8f7c44e55dc..6d34c71c9df 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3569,6 +3569,41 @@ cdef class RecordBatch(_Tabular): row_major, pool)) return pyarrow_wrap_tensor(c_tensor) + def copy_to(self, destination): + """ + Copy the entire RecordBatch to destination device. 
+ + This copies each column of the record batch to create + a new record batch where all underlying buffers for the columns have + been copied to the destination MemoryManager. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + RecordBatch + """ + cdef: + shared_ptr[CRecordBatch] c_batch + shared_ptr[CMemoryManager] c_memory_manager + + if isinstance(destination, Device): + c_memory_manager = (destination).unwrap().get().default_memory_manager() + elif isinstance(destination, MemoryManager): + c_memory_manager = (destination).unwrap() + else: + raise TypeError( + "Argument 'destination' has incorrect type (expected a " + f"pyarrow Device or MemoryManager, got {type(destination)})" + ) + + with nogil: + c_batch = GetResultValue(self.batch.CopyTo(c_memory_manager)) + return pyarrow_wrap_batch(c_batch) + def _export_to_c(self, out_ptr, out_schema_ptr=0): """ Export to a C ArrowArray struct, given its pointer. diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index 36b97a62064..d55be651b15 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -827,21 +827,29 @@ def test_IPC(size): assert p.exitcode == 0 -def _arr_copy_to_host(carr): - # TODO replace below with copy to device when exposed in python - buffers = [] - for cbuf in carr.buffers(): - if cbuf is None: - buffers.append(None) - else: - buf = global_context.foreign_buffer( - cbuf.address, cbuf.size, cbuf - ).copy_to_host() - buffers.append(buf) - - child = pa.Array.from_buffers(carr.type.value_type, 3, buffers[2:]) - new = pa.Array.from_buffers(carr.type, 2, buffers[:2], children=[child]) - return new +def test_copy_to(): + _, buf = make_random_buffer(size=10, target='device') + mm_cuda = buf.memory_manager + + for dest in [mm_cuda, mm_cuda.device]: + arr = pa.array([0, 1, 2]) + arr_cuda = arr.copy_to(dest) + assert not arr_cuda.buffers()[1].is_cpu + assert arr_cuda.buffers()[1].device_type == pa.DeviceAllocationType.CUDA + assert arr_cuda.buffers()[1].device == mm_cuda.device + + arr_roundtrip = arr_cuda.copy_to(pa.default_cpu_memory_manager()) + assert arr_roundtrip.equals(arr) + + batch = pa.record_batch({"col": arr}) + batch_cuda = batch.copy_to(dest) + buf_cuda = batch_cuda["col"].buffers()[1] + assert not buf_cuda.is_cpu + assert buf_cuda.device_type == pa.DeviceAllocationType.CUDA + assert buf_cuda.device == mm_cuda.device + + batch_roundtrip = batch_cuda.copy_to(pa.default_cpu_memory_manager()) + assert batch_roundtrip.equals(batch) def test_device_interface_array(): @@ -856,19 +864,10 @@ def test_device_interface_array(): typ = pa.list_(pa.int32()) arr = pa.array([[1], [2, 42]], type=typ) - # TODO replace below with copy to device when exposed in python - cbuffers = [] - for buf in arr.buffers(): - if buf is None: - cbuffers.append(None) - else: - cbuf = global_context.new_buffer(buf.size) - cbuf.copy_from_host(buf, position=0, nbytes=buf.size) - cbuffers.append(cbuf) - - carr = pa.Array.from_buffers(typ, 2, cbuffers[:2], children=[ - pa.Array.from_buffers(typ.value_type, 3, cbuffers[2:]) - ]) + # copy to device + _, buf = make_random_buffer(size=10, target='device') + mm_cuda = buf.memory_manager + carr = arr.copy_to(mm_cuda) # Type is known up front carr._export_to_c_device(ptr_array) @@ -882,7 +881,7 @@ def test_device_interface_array(): del carr carr_new = pa.Array._import_from_c_device(ptr_array, typ) assert carr_new.type == pa.list_(pa.int32()) - 
arr_new = _arr_copy_to_host(carr_new) + arr_new = carr_new.copy_to(pa.default_cpu_memory_manager()) assert arr_new.equals(arr) del carr_new @@ -891,15 +890,13 @@ def test_device_interface_array(): pa.Array._import_from_c_device(ptr_array, typ) # Schema is exported and imported at the same time - carr = pa.Array.from_buffers(typ, 2, cbuffers[:2], children=[ - pa.Array.from_buffers(typ.value_type, 3, cbuffers[2:]) - ]) + carr = arr.copy_to(mm_cuda) carr._export_to_c_device(ptr_array, ptr_schema) # Delete and recreate C++ objects from exported pointers del carr carr_new = pa.Array._import_from_c_device(ptr_array, ptr_schema) assert carr_new.type == pa.list_(pa.int32()) - arr_new = _arr_copy_to_host(carr_new) + arr_new = carr_new.copy_to(pa.default_cpu_memory_manager()) assert arr_new.equals(arr) del carr_new @@ -908,21 +905,6 @@ def test_device_interface_array(): pa.Array._import_from_c_device(ptr_array, ptr_schema) -def _batch_copy_to_host(cbatch): - # TODO replace below with copy to device when exposed in python - arrs = [] - for col in cbatch.columns: - buffers = [ - global_context.foreign_buffer(buf.address, buf.size, buf).copy_to_host() - if buf is not None else None - for buf in col.buffers() - ] - new = pa.Array.from_buffers(col.type, len(col), buffers) - arrs.append(new) - - return pa.RecordBatch.from_arrays(arrs, schema=cbatch.schema) - - def test_device_interface_batch_array(): cffi = pytest.importorskip("pyarrow.cffi") ffi = cffi.ffi @@ -949,7 +931,7 @@ def test_device_interface_batch_array(): del cbatch cbatch_new = pa.RecordBatch._import_from_c_device(ptr_array, schema) assert cbatch_new.schema == schema - batch_new = _batch_copy_to_host(cbatch_new) + batch_new = cbatch_new.copy_to(pa.default_cpu_memory_manager()) assert batch_new.equals(batch) del cbatch_new @@ -964,7 +946,7 @@ def test_device_interface_batch_array(): del cbatch cbatch_new = pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) assert cbatch_new.schema == schema - batch_new = _batch_copy_to_host(cbatch_new) + batch_new = cbatch_new.copy_to(pa.default_cpu_memory_manager()) assert batch_new.equals(batch) del cbatch_new diff --git a/python/pyarrow/tests/test_device.py b/python/pyarrow/tests/test_device.py index 6bdb015be1a..dc1a51e6d00 100644 --- a/python/pyarrow/tests/test_device.py +++ b/python/pyarrow/tests/test_device.py @@ -17,6 +17,8 @@ import pyarrow as pa +import pytest + def test_device_memory_manager(): mm = pa.default_cpu_memory_manager() @@ -41,3 +43,27 @@ def test_buffer_device(): assert buf.device.is_cpu assert buf.device == pa.default_cpu_memory_manager().device assert buf.memory_manager.is_cpu + + +def test_copy_to(): + mm = pa.default_cpu_memory_manager() + + arr = pa.array([0, 1, 2]) + batch = pa.record_batch({"col": arr}) + + for dest in [mm, mm.device]: + arr_copied = arr.copy_to(dest) + assert arr_copied.equals(arr) + assert arr_copied.buffers()[1].device == mm.device + assert arr_copied.buffers()[1].address != arr.buffers()[1].address + + batch_copied = batch.copy_to(dest) + assert batch_copied.equals(batch) + assert batch_copied["col"].buffers()[1].device == mm.device + assert batch_copied["col"].buffers()[1].address != arr.buffers()[1].address + + with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): + arr.copy_to(mm.device.device_type) + + with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): + batch.copy_to(mm.device.device_type) From f9911ee2ffc62fa946b2e1198bcdd13a757181fe Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: 
Wed, 21 Aug 2024 14:37:47 +0200 Subject: [PATCH 052/157] GH-43776: [C++] Add chunked Take benchmarks with a small selection factor (#43772) This should help exercise the performance of chunked Take implementation on more use cases. * GitHub Issue: #43776 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../kernels/vector_selection_benchmark.cc | 91 ++++++++++++++++--- 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index c2a27dfe434..75affd32560 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -17,6 +17,7 @@ #include "benchmark/benchmark.h" +#include #include #include @@ -42,6 +43,9 @@ struct FilterParams { const double filter_null_proportion; }; +constexpr double kDefaultTakeSelectionFactor = 1.0; +constexpr double kSmallTakeSelectionFactor = 0.05; + std::vector g_data_sizes = {kL2Size}; // The benchmark state parameter references this vector of cases. Test high and @@ -104,14 +108,21 @@ struct TakeBenchmark { benchmark::State& state; RegressionArgs args; random::RandomArrayGenerator rand; + double selection_factor; bool indices_have_nulls; bool monotonic_indices = false; TakeBenchmark(benchmark::State& state, bool indices_have_nulls, bool monotonic_indices = false) + : TakeBenchmark(state, /*selection_factor=*/kDefaultTakeSelectionFactor, + indices_have_nulls, monotonic_indices) {} + + TakeBenchmark(benchmark::State& state, double selection_factor, bool indices_have_nulls, + bool monotonic_indices = false) : state(state), args(state, /*size_is_bytes=*/false), rand(kSeed), + selection_factor(selection_factor), indices_have_nulls(indices_have_nulls), monotonic_indices(monotonic_indices) {} @@ -185,10 +196,10 @@ struct TakeBenchmark { } void Bench(const std::shared_ptr& values) { - double indices_null_proportion = indices_have_nulls ? args.null_proportion : 0; - auto indices = - rand.Int32(values->length(), 0, static_cast(values->length() - 1), - indices_null_proportion); + const double indices_null_proportion = indices_have_nulls ? args.null_proportion : 0; + const int64_t num_indices = static_cast(selection_factor * values->length()); + auto indices = rand.Int32(num_indices, 0, static_cast(values->length() - 1), + indices_null_proportion); if (monotonic_indices) { auto arg_sorter = *SortIndices(*indices); @@ -198,14 +209,15 @@ struct TakeBenchmark { for (auto _ : state) { ABORT_NOT_OK(Take(values, indices).status()); } - state.SetItemsProcessed(state.iterations() * values->length()); + state.SetItemsProcessed(state.iterations() * num_indices); + state.counters["selection_factor"] = selection_factor; } void BenchChunked(const std::shared_ptr& values, bool chunk_indices_too) { double indices_null_proportion = indices_have_nulls ? 
args.null_proportion : 0; - auto indices = - rand.Int32(values->length(), 0, static_cast(values->length() - 1), - indices_null_proportion); + const int64_t num_indices = static_cast(selection_factor * values->length()); + auto indices = rand.Int32(num_indices, 0, static_cast(values->length() - 1), + indices_null_proportion); if (monotonic_indices) { auto arg_sorter = *SortIndices(*indices); @@ -213,14 +225,26 @@ struct TakeBenchmark { } std::shared_ptr chunked_indices; if (chunk_indices_too) { + // Here we choose for indices chunks to have roughly the same length + // as values chunks, but there may be less of them if selection_factor < 1.0. + // The alternative is to have the same number of chunks, but with a potentially + // much smaller (and irrealistic) length. std::vector> indices_chunks; + // Make sure there are at least two chunks of indices + const auto max_chunk_length = indices->length() / 2 + 1; int64_t offset = 0; for (int i = 0; i < values->num_chunks(); ++i) { - auto chunk = indices->Slice(offset, values->chunk(i)->length()); + const auto chunk_length = std::min(max_chunk_length, values->chunk(i)->length()); + auto chunk = indices->Slice(offset, chunk_length); indices_chunks.push_back(std::move(chunk)); - offset += values->chunk(i)->length(); + offset += chunk_length; + if (offset >= indices->length()) { + break; + } } chunked_indices = std::make_shared(std::move(indices_chunks)); + ARROW_CHECK_EQ(chunked_indices->length(), num_indices); + ARROW_CHECK_GT(chunked_indices->num_chunks(), 1); } if (chunk_indices_too) { @@ -232,7 +256,8 @@ struct TakeBenchmark { ABORT_NOT_OK(Take(values, indices).status()); } } - state.SetItemsProcessed(state.iterations() * values->length()); + state.SetItemsProcessed(state.iterations() * num_indices); + state.counters["selection_factor"] = selection_factor; } }; @@ -432,12 +457,25 @@ static void TakeChunkedChunkedInt64RandomIndicesWithNulls(benchmark::State& stat .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/true); } +static void TakeChunkedChunkedInt64FewRandomIndicesWithNulls(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/true) + .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/true); +} + static void TakeChunkedChunkedInt64MonotonicIndices(benchmark::State& state) { TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true) .ChunkedInt64( /*num_chunks=*/100, /*chunk_indices_too=*/true); } +static void TakeChunkedChunkedInt64FewMonotonicIndices(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/false, /*monotonic=*/true) + .ChunkedInt64( + /*num_chunks=*/100, /*chunk_indices_too=*/true); +} + static void TakeChunkedChunkedFSBRandomIndicesNoNulls(benchmark::State& state) { TakeBenchmark(state, /*indices_with_nulls=*/false) .ChunkedFSB(/*num_chunks=*/100, /*chunk_indices_too=*/true); @@ -463,11 +501,23 @@ static void TakeChunkedChunkedStringRandomIndicesWithNulls(benchmark::State& sta .ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true); } +static void TakeChunkedChunkedStringFewRandomIndicesWithNulls(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/true) + .ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true); +} + static void TakeChunkedChunkedStringMonotonicIndices(benchmark::State& state) { TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true) 
.ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true); } +static void TakeChunkedChunkedStringFewMonotonicIndices(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/false, /*monotonic=*/true) + .ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true); +} + static void TakeChunkedFlatInt64RandomIndicesNoNulls(benchmark::State& state) { TakeBenchmark(state, /*indices_with_nulls=*/false) .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/false); @@ -478,12 +528,25 @@ static void TakeChunkedFlatInt64RandomIndicesWithNulls(benchmark::State& state) .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/false); } +static void TakeChunkedFlatInt64FewRandomIndicesWithNulls(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/true) + .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/false); +} + static void TakeChunkedFlatInt64MonotonicIndices(benchmark::State& state) { TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true) .ChunkedInt64( /*num_chunks=*/100, /*chunk_indices_too=*/false); } +static void TakeChunkedFlatInt64FewMonotonicIndices(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/false, /*monotonic=*/true) + .ChunkedInt64( + /*num_chunks=*/100, /*chunk_indices_too=*/false); +} + void FilterSetArgs(benchmark::internal::Benchmark* bench) { for (int64_t size : g_data_sizes) { for (int i = 0; i < static_cast(g_filter_params.size()); ++i) { @@ -560,18 +623,24 @@ BENCHMARK(TakeStringMonotonicIndices)->Apply(TakeSetArgs); // Chunked values x Chunked indices BENCHMARK(TakeChunkedChunkedInt64RandomIndicesNoNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedInt64RandomIndicesWithNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedInt64FewRandomIndicesWithNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedInt64MonotonicIndices)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedInt64FewMonotonicIndices)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedFSBRandomIndicesNoNulls)->Apply(TakeFSBSetArgs); BENCHMARK(TakeChunkedChunkedFSBRandomIndicesWithNulls)->Apply(TakeFSBSetArgs); BENCHMARK(TakeChunkedChunkedFSBMonotonicIndices)->Apply(TakeFSBSetArgs); BENCHMARK(TakeChunkedChunkedStringRandomIndicesNoNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedStringRandomIndicesWithNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedStringFewRandomIndicesWithNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedStringMonotonicIndices)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedStringFewMonotonicIndices)->Apply(TakeSetArgs); // Chunked values x Flat indices BENCHMARK(TakeChunkedFlatInt64RandomIndicesNoNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedFlatInt64RandomIndicesWithNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedFlatInt64FewRandomIndicesWithNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedFlatInt64MonotonicIndices)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedFlatInt64FewMonotonicIndices)->Apply(TakeSetArgs); } // namespace compute } // namespace arrow From f078942ce2df68de8f48c3b4233132133601ca53 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 22 Aug 2024 02:59:04 +1200 Subject: [PATCH 053/157] GH-43141: [C++][Parquet] Replace use of int with int32_t in the internal Parquet encryption APIs (#43413) ### Rationale for this change See #43141 ### What changes are included in this PR? 
* Changes uses of int to int32_t in the Encryptor and Decryptor APIs, except where interfacing with OpenSSL. * Also change RandBytes to use size_t instead of int and check for overflow. * Check the return code from OpenSSL's Rand_bytes in case there is a failure generating random bytes ### Are these changes tested? Yes, this doesn't change behaviour and is covered by existing tests. ### Are there any user-facing changes? No * GitHub Issue: #43141 Authored-by: Adam Reeve Signed-off-by: Antoine Pitrou --- cpp/src/parquet/column_reader.cc | 4 +- cpp/src/parquet/encryption/crypto_factory.cc | 6 +- .../parquet/encryption/encryption_internal.cc | 251 ++++++++++-------- .../parquet/encryption/encryption_internal.h | 46 ++-- .../encryption/encryption_internal_nossl.cc | 47 ++-- .../encryption/encryption_internal_test.cc | 22 +- .../parquet/encryption/file_key_wrapper.cc | 4 +- .../encryption/internal_file_decryptor.cc | 12 +- .../encryption/internal_file_decryptor.h | 8 +- .../encryption/internal_file_encryptor.cc | 10 +- .../encryption/internal_file_encryptor.h | 6 +- .../encryption/key_toolkit_internal.cc | 2 +- cpp/src/parquet/metadata.cc | 6 +- cpp/src/parquet/thrift_internal.h | 2 +- 14 files changed, 233 insertions(+), 193 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 05ee6a16c54..60a8a2176b0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -468,8 +468,8 @@ std::shared_ptr SerializedPageReader::NextPage() { // Advance the stream offset PARQUET_THROW_NOT_OK(stream_->Advance(header_size)); - int compressed_len = current_page_header_.compressed_page_size; - int uncompressed_len = current_page_header_.uncompressed_page_size; + int32_t compressed_len = current_page_header_.compressed_page_size; + int32_t uncompressed_len = current_page_header_.uncompressed_page_size; if (compressed_len < 0 || uncompressed_len < 0) { throw ParquetException("Invalid page header"); } diff --git a/cpp/src/parquet/encryption/crypto_factory.cc b/cpp/src/parquet/encryption/crypto_factory.cc index 72506bdc014..56069d55977 100644 --- a/cpp/src/parquet/encryption/crypto_factory.cc +++ b/cpp/src/parquet/encryption/crypto_factory.cc @@ -72,8 +72,7 @@ std::shared_ptr CryptoFactory::GetFileEncryptionProper int dek_length = dek_length_bits / 8; std::string footer_key(dek_length, '\0'); - RandBytes(reinterpret_cast(&footer_key[0]), - static_cast(footer_key.size())); + RandBytes(reinterpret_cast(footer_key.data()), footer_key.size()); std::string footer_key_metadata = key_wrapper.GetEncryptionKeyMetadata(footer_key, footer_key_id, true); @@ -148,8 +147,7 @@ ColumnPathToEncryptionPropertiesMap CryptoFactory::GetColumnEncryptionProperties } std::string column_key(dek_length, '\0'); - RandBytes(reinterpret_cast(&column_key[0]), - static_cast(column_key.size())); + RandBytes(reinterpret_cast(column_key.data()), column_key.size()); std::string column_key_key_metadata = key_wrapper->GetEncryptionKeyMetadata(column_key, column_key_id, false); diff --git a/cpp/src/parquet/encryption/encryption_internal.cc b/cpp/src/parquet/encryption/encryption_internal.cc index 99d1707f4a8..a0d9367b619 100644 --- a/cpp/src/parquet/encryption/encryption_internal.cc +++ b/cpp/src/parquet/encryption/encryption_internal.cc @@ -18,6 +18,7 @@ #include "parquet/encryption/encryption_internal.h" #include +#include #include #include @@ -36,10 +37,10 @@ using parquet::ParquetException; namespace parquet::encryption { -constexpr int kGcmMode = 0; -constexpr int kCtrMode = 
1; -constexpr int kCtrIvLength = 16; -constexpr int kBufferSizeLength = 4; +constexpr int32_t kGcmMode = 0; +constexpr int32_t kCtrMode = 1; +constexpr int32_t kCtrIvLength = 16; +constexpr int32_t kBufferSizeLength = 4; #define ENCRYPT_INIT(CTX, ALG) \ if (1 != EVP_EncryptInit_ex(CTX, ALG, nullptr, nullptr, nullptr)) { \ @@ -53,17 +54,17 @@ constexpr int kBufferSizeLength = 4; class AesEncryptor::AesEncryptorImpl { public: - explicit AesEncryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesEncryptorImpl(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length); ~AesEncryptorImpl() { WipeOut(); } - int Encrypt(span plaintext, span key, - span aad, span ciphertext); + int32_t Encrypt(span plaintext, span key, + span aad, span ciphertext); - int SignedFooterEncrypt(span footer, span key, - span aad, span nonce, - span encrypted_footer); + int32_t SignedFooterEncrypt(span footer, span key, + span aad, span nonce, + span encrypted_footer); void WipeOut() { if (nullptr != ctx_) { EVP_CIPHER_CTX_free(ctx_); @@ -89,21 +90,22 @@ class AesEncryptor::AesEncryptorImpl { private: EVP_CIPHER_CTX* ctx_; - int aes_mode_; - int key_length_; - int ciphertext_size_delta_; - int length_buffer_length_; + int32_t aes_mode_; + int32_t key_length_; + int32_t ciphertext_size_delta_; + int32_t length_buffer_length_; - int GcmEncrypt(span plaintext, span key, - span nonce, span aad, - span ciphertext); + int32_t GcmEncrypt(span plaintext, span key, + span nonce, span aad, + span ciphertext); - int CtrEncrypt(span plaintext, span key, - span nonce, span ciphertext); + int32_t CtrEncrypt(span plaintext, span key, + span nonce, span ciphertext); }; -AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int key_len, - bool metadata, bool write_length) { +AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool write_length) { openssl::EnsureInitialized(); ctx_ = nullptr; @@ -151,11 +153,9 @@ AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int } } -int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(span footer, - span key, - span aad, - span nonce, - span encrypted_footer) { +int32_t AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt( + span footer, span key, span aad, + span nonce, span encrypted_footer) { if (static_cast(key_length_) != key.size()) { std::stringstream ss; ss << "Wrong key length " << key.size() << ". Should be " << key_length_; @@ -176,10 +176,10 @@ int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(span foot return GcmEncrypt(footer, key, nonce, aad, encrypted_footer); } -int AesEncryptor::AesEncryptorImpl::Encrypt(span plaintext, - span key, - span aad, - span ciphertext) { +int32_t AesEncryptor::AesEncryptorImpl::Encrypt(span plaintext, + span key, + span aad, + span ciphertext) { if (static_cast(key_length_) != key.size()) { std::stringstream ss; ss << "Wrong key length " << key.size() << ". 
Should be " << key_length_; @@ -205,13 +205,13 @@ int AesEncryptor::AesEncryptorImpl::Encrypt(span plaintext, return CtrEncrypt(plaintext, key, nonce, ciphertext); } -int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, - span key, - span nonce, - span aad, - span ciphertext) { +int32_t AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, + span key, + span nonce, + span aad, + span ciphertext) { int len; - int ciphertext_len; + int32_t ciphertext_len; std::array tag{}; @@ -227,12 +227,22 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, } // Setting additional authenticated data + if (aad.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "AAD size " << aad.size() << " overflows int"; + throw ParquetException(ss.str()); + } if ((!aad.empty()) && (1 != EVP_EncryptUpdate(ctx_, nullptr, &len, aad.data(), static_cast(aad.size())))) { throw ParquetException("Couldn't set AAD"); } // Encryption + if (plaintext.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "Plaintext size " << plaintext.size() << " overflows int"; + throw ParquetException(ss.str()); + } if (1 != EVP_EncryptUpdate(ctx_, ciphertext.data() + length_buffer_length_ + kNonceLength, &len, plaintext.data(), static_cast(plaintext.size()))) { @@ -256,7 +266,7 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, } // Copying the buffer size, nonce and tag to ciphertext - int buffer_size = kNonceLength + ciphertext_len + kGcmTagLength; + int32_t buffer_size = kNonceLength + ciphertext_len + kGcmTagLength; if (length_buffer_length_ > 0) { ciphertext[3] = static_cast(0xff & (buffer_size >> 24)); ciphertext[2] = static_cast(0xff & (buffer_size >> 16)); @@ -271,12 +281,12 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, return length_buffer_length_ + buffer_size; } -int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, - span key, - span nonce, - span ciphertext) { +int32_t AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, + span key, + span nonce, + span ciphertext) { int len; - int ciphertext_len; + int32_t ciphertext_len; if (nonce.size() != static_cast(kNonceLength)) { std::stringstream ss; @@ -298,6 +308,11 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, } // Encryption + if (plaintext.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "Plaintext size " << plaintext.size() << " overflows int"; + throw ParquetException(ss.str()); + } if (1 != EVP_EncryptUpdate(ctx_, ciphertext.data() + length_buffer_length_ + kNonceLength, &len, plaintext.data(), static_cast(plaintext.size()))) { @@ -316,7 +331,7 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, ciphertext_len += len; // Copying the buffer size and nonce to ciphertext - int buffer_size = kNonceLength + ciphertext_len; + int32_t buffer_size = kNonceLength + ciphertext_len; if (length_buffer_length_ > 0) { ciphertext[3] = static_cast(0xff & (buffer_size >> 24)); ciphertext[2] = static_cast(0xff & (buffer_size >> 16)); @@ -331,9 +346,11 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(span footer, span key, - span aad, span nonce, - span encrypted_footer) { +int32_t AesEncryptor::SignedFooterEncrypt(span footer, + span key, + span aad, + span nonce, + span encrypted_footer) { return impl_->SignedFooterEncrypt(footer, key, aad, nonce, encrypted_footer); } @@ -343,25 +360,25 @@ int32_t 
AesEncryptor::CiphertextLength(int64_t plaintext_len) const { return impl_->CiphertextLength(plaintext_len); } -int AesEncryptor::Encrypt(span plaintext, span key, - span aad, span ciphertext) { +int32_t AesEncryptor::Encrypt(span plaintext, span key, + span aad, span ciphertext) { return impl_->Encrypt(plaintext, key, aad, ciphertext); } -AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length) : impl_{std::unique_ptr( new AesEncryptorImpl(alg_id, key_len, metadata, write_length))} {} class AesDecryptor::AesDecryptorImpl { public: - explicit AesDecryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesDecryptorImpl(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length); ~AesDecryptorImpl() { WipeOut(); } - int Decrypt(span ciphertext, span key, - span aad, span plaintext); + int32_t Decrypt(span ciphertext, span key, + span aad, span plaintext); void WipeOut() { if (nullptr != ctx_) { @@ -370,7 +387,7 @@ class AesDecryptor::AesDecryptorImpl { } } - [[nodiscard]] int PlaintextLength(int ciphertext_len) const { + [[nodiscard]] int32_t PlaintextLength(int32_t ciphertext_len) const { if (ciphertext_len < ciphertext_size_delta_) { std::stringstream ss; ss << "Ciphertext length " << ciphertext_len << " is invalid, expected at least " @@ -380,12 +397,13 @@ class AesDecryptor::AesDecryptorImpl { return ciphertext_len - ciphertext_size_delta_; } - [[nodiscard]] int CiphertextLength(int plaintext_len) const { + [[nodiscard]] int32_t CiphertextLength(int32_t plaintext_len) const { if (plaintext_len < 0) { std::stringstream ss; ss << "Negative plaintext length " << plaintext_len; throw ParquetException(ss.str()); - } else if (plaintext_len > std::numeric_limits::max() - ciphertext_size_delta_) { + } else if (plaintext_len > + std::numeric_limits::max() - ciphertext_size_delta_) { std::stringstream ss; ss << "Plaintext length " << plaintext_len << " plus ciphertext size delta " << ciphertext_size_delta_ << " overflows int32"; @@ -396,24 +414,24 @@ class AesDecryptor::AesDecryptorImpl { private: EVP_CIPHER_CTX* ctx_; - int aes_mode_; - int key_length_; - int ciphertext_size_delta_; - int length_buffer_length_; + int32_t aes_mode_; + int32_t key_length_; + int32_t ciphertext_size_delta_; + int32_t length_buffer_length_; /// Get the actual ciphertext length, inclusive of the length buffer length, /// and validate that the provided buffer size is large enough. 
- [[nodiscard]] int GetCiphertextLength(span ciphertext) const; + [[nodiscard]] int32_t GetCiphertextLength(span ciphertext) const; - int GcmDecrypt(span ciphertext, span key, - span aad, span plaintext); + int32_t GcmDecrypt(span ciphertext, span key, + span aad, span plaintext); - int CtrDecrypt(span ciphertext, span key, - span plaintext); + int32_t CtrDecrypt(span ciphertext, span key, + span plaintext); }; -int AesDecryptor::Decrypt(span ciphertext, span key, - span aad, span plaintext) { +int32_t AesDecryptor::Decrypt(span ciphertext, span key, + span aad, span plaintext) { return impl_->Decrypt(ciphertext, key, aad, plaintext); } @@ -421,8 +439,9 @@ void AesDecryptor::WipeOut() { impl_->WipeOut(); } AesDecryptor::~AesDecryptor() {} -AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int key_len, - bool metadata, bool contains_length) { +AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool contains_length) { openssl::EnsureInitialized(); ctx_ = nullptr; @@ -469,13 +488,14 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int } } -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata) { return Make(alg_id, key_len, metadata, true /*write_length*/); } -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata, bool write_length) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool write_length) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -485,13 +505,13 @@ std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int return std::make_unique(alg_id, key_len, metadata, write_length); } -AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length) : impl_{std::unique_ptr( new AesDecryptorImpl(alg_id, key_len, metadata, contains_length))} {} std::shared_ptr AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, + ParquetCipher::type alg_id, int32_t key_len, bool metadata, std::vector>* all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; @@ -506,15 +526,15 @@ std::shared_ptr AesDecryptor::Make( return decryptor; } -int AesDecryptor::PlaintextLength(int ciphertext_len) const { +int32_t AesDecryptor::PlaintextLength(int32_t ciphertext_len) const { return impl_->PlaintextLength(ciphertext_len); } -int AesDecryptor::CiphertextLength(int plaintext_len) const { +int32_t AesDecryptor::CiphertextLength(int32_t plaintext_len) const { return impl_->CiphertextLength(plaintext_len); } -int AesDecryptor::AesDecryptorImpl::GetCiphertextLength( +int32_t AesDecryptor::AesDecryptorImpl::GetCiphertextLength( span ciphertext) const { if (length_buffer_length_ > 0) { // Note: length_buffer_length_ must be either 0 or kBufferSizeLength @@ -533,10 +553,11 @@ int AesDecryptor::AesDecryptorImpl::GetCiphertextLength( (static_cast(ciphertext[0])); if (written_ciphertext_len > - static_cast(std::numeric_limits::max() - length_buffer_length_)) { + static_cast(std::numeric_limits::max() - + length_buffer_length_)) { 
std::stringstream ss; ss << "Written ciphertext length " << written_ciphertext_len - << " plus length buffer length " << length_buffer_length_ << " overflows int"; + << " plus length buffer length " << length_buffer_length_ << " overflows int32"; throw ParquetException(ss.str()); } else if (ciphertext.size() < static_cast(written_ciphertext_len) + length_buffer_length_) { @@ -548,28 +569,28 @@ int AesDecryptor::AesDecryptorImpl::GetCiphertextLength( throw ParquetException(ss.str()); } - return static_cast(written_ciphertext_len) + length_buffer_length_; + return static_cast(written_ciphertext_len) + length_buffer_length_; } else { - if (ciphertext.size() > static_cast(std::numeric_limits::max())) { + if (ciphertext.size() > static_cast(std::numeric_limits::max())) { std::stringstream ss; - ss << "Ciphertext buffer length " << ciphertext.size() << " overflows int"; + ss << "Ciphertext buffer length " << ciphertext.size() << " overflows int32"; throw ParquetException(ss.str()); } - return static_cast(ciphertext.size()); + return static_cast(ciphertext.size()); } } -int AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, - span key, - span aad, - span plaintext) { +int32_t AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, + span key, + span aad, + span plaintext) { int len; - int plaintext_len; + int32_t plaintext_len; std::array tag{}; std::array nonce{}; - int ciphertext_len = GetCiphertextLength(ciphertext); + int32_t ciphertext_len = GetCiphertextLength(ciphertext); if (plaintext.size() < static_cast(ciphertext_len) - ciphertext_size_delta_) { std::stringstream ss; @@ -597,16 +618,22 @@ int AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, } // Setting additional authenticated data + if (aad.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "AAD size " << aad.size() << " overflows int"; + throw ParquetException(ss.str()); + } if ((!aad.empty()) && (1 != EVP_DecryptUpdate(ctx_, nullptr, &len, aad.data(), static_cast(aad.size())))) { throw ParquetException("Couldn't set AAD"); } // Decryption - if (!EVP_DecryptUpdate( - ctx_, plaintext.data(), &len, - ciphertext.data() + length_buffer_length_ + kNonceLength, - ciphertext_len - length_buffer_length_ - kNonceLength - kGcmTagLength)) { + int decryption_length = + ciphertext_len - length_buffer_length_ - kNonceLength - kGcmTagLength; + if (!EVP_DecryptUpdate(ctx_, plaintext.data(), &len, + ciphertext.data() + length_buffer_length_ + kNonceLength, + decryption_length)) { throw ParquetException("Failed decryption update"); } @@ -626,15 +653,15 @@ int AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, return plaintext_len; } -int AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, - span key, - span plaintext) { +int32_t AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, + span key, + span plaintext) { int len; - int plaintext_len; + int32_t plaintext_len; std::array iv{}; - int ciphertext_len = GetCiphertextLength(ciphertext); + int32_t ciphertext_len = GetCiphertextLength(ciphertext); if (plaintext.size() < static_cast(ciphertext_len) - ciphertext_size_delta_) { std::stringstream ss; @@ -665,9 +692,10 @@ int AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, } // Decryption + int decryption_length = ciphertext_len - length_buffer_length_ - kNonceLength; if (!EVP_DecryptUpdate(ctx_, plaintext.data(), &len, ciphertext.data() + length_buffer_length_ + kNonceLength, - ciphertext_len - length_buffer_length_ - kNonceLength)) { + decryption_length)) { throw 
ParquetException("Failed decryption update"); } @@ -682,10 +710,10 @@ int AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, return plaintext_len; } -int AesDecryptor::AesDecryptorImpl::Decrypt(span ciphertext, - span key, - span aad, - span plaintext) { +int32_t AesDecryptor::AesDecryptorImpl::Decrypt(span ciphertext, + span key, + span aad, + span plaintext) { if (static_cast(key_length_) != key.size()) { std::stringstream ss; ss << "Wrong key length " << key.size() << ". Should be " << key_length_; @@ -758,9 +786,22 @@ void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD) { std::memcpy(AAD->data() + AAD->length() - 2, page_ordinal_bytes.data(), 2); } -void RandBytes(unsigned char* buf, int num) { +void RandBytes(unsigned char* buf, size_t num) { + if (num > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "Length " << num << " for RandBytes overflows int"; + throw ParquetException(ss.str()); + } openssl::EnsureInitialized(); - RAND_bytes(buf, num); + int status = RAND_bytes(buf, static_cast(num)); + if (status != 1) { + const auto error_code = ERR_get_error(); + char buffer[256]; + ERR_error_string_n(error_code, buffer, sizeof(buffer)); + std::stringstream ss; + ss << "Failed to generate random bytes: " << buffer; + throw ParquetException(ss.str()); + } } void EnsureBackendInitialized() { openssl::EnsureInitialized(); } diff --git a/cpp/src/parquet/encryption/encryption_internal.h b/cpp/src/parquet/encryption/encryption_internal.h index c874b137ad1..d79ff56ad49 100644 --- a/cpp/src/parquet/encryption/encryption_internal.h +++ b/cpp/src/parquet/encryption/encryption_internal.h @@ -29,8 +29,8 @@ using parquet::ParquetCipher; namespace parquet::encryption { -constexpr int kGcmTagLength = 16; -constexpr int kNonceLength = 12; +constexpr int32_t kGcmTagLength = 16; +constexpr int32_t kNonceLength = 12; // Module types constexpr int8_t kFooter = 0; @@ -49,13 +49,13 @@ class PARQUET_EXPORT AesEncryptor { public: /// Can serve one key length only. Possible values: 16, 24, 32 bytes. /// If write_length is true, prepend ciphertext length to the ciphertext - explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesEncryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length = true); - static std::unique_ptr Make(ParquetCipher::type alg_id, int key_len, + static std::unique_ptr Make(ParquetCipher::type alg_id, int32_t key_len, bool metadata); - static std::unique_ptr Make(ParquetCipher::type alg_id, int key_len, + static std::unique_ptr Make(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length); ~AesEncryptor(); @@ -65,17 +65,17 @@ class PARQUET_EXPORT AesEncryptor { /// Encrypts plaintext with the key and aad. Key length is passed only for validation. /// If different from value in constructor, exception will be thrown. - int Encrypt(::arrow::util::span plaintext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span ciphertext); + int32_t Encrypt(::arrow::util::span plaintext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span ciphertext); /// Encrypts plaintext footer, in order to compute footer signature (tag). 
- int SignedFooterEncrypt(::arrow::util::span footer, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span nonce, - ::arrow::util::span encrypted_footer); + int32_t SignedFooterEncrypt(::arrow::util::span footer, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span nonce, + ::arrow::util::span encrypted_footer); void WipeOut(); @@ -90,7 +90,7 @@ class PARQUET_EXPORT AesDecryptor { public: /// Can serve one key length only. Possible values: 16, 24, 32 bytes. /// If contains_length is true, expect ciphertext length prepended to the ciphertext - explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesDecryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length = true); /// \brief Factory function to create an AesDecryptor @@ -102,26 +102,26 @@ class PARQUET_EXPORT AesDecryptor { /// out when decryption is finished /// \return shared pointer to a new AesDecryptor static std::shared_ptr Make( - ParquetCipher::type alg_id, int key_len, bool metadata, + ParquetCipher::type alg_id, int32_t key_len, bool metadata, std::vector>* all_decryptors); ~AesDecryptor(); void WipeOut(); /// The size of the plaintext, for this cipher and the specified ciphertext length. - [[nodiscard]] int PlaintextLength(int ciphertext_len) const; + [[nodiscard]] int32_t PlaintextLength(int32_t ciphertext_len) const; /// The size of the ciphertext, for this cipher and the specified plaintext length. - [[nodiscard]] int CiphertextLength(int plaintext_len) const; + [[nodiscard]] int32_t CiphertextLength(int32_t plaintext_len) const; /// Decrypts ciphertext with the key and aad. Key length is passed only for /// validation. If different from value in constructor, exception will be thrown. /// The caller is responsible for ensuring that the plaintext buffer is at least as /// large as PlaintextLength(ciphertext_len). - int Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span plaintext); + int32_t Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span plaintext); private: // PIMPL Idiom @@ -139,7 +139,7 @@ std::string CreateFooterAad(const std::string& aad_prefix_bytes); void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD); // Wraps OpenSSL RAND_bytes function -void RandBytes(unsigned char* buf, int num); +void RandBytes(unsigned char* buf, size_t num); // Ensure OpenSSL is initialized. 
// diff --git a/cpp/src/parquet/encryption/encryption_internal_nossl.cc b/cpp/src/parquet/encryption/encryption_internal_nossl.cc index 2cce83915d7..2a8162ed396 100644 --- a/cpp/src/parquet/encryption/encryption_internal_nossl.cc +++ b/cpp/src/parquet/encryption/encryption_internal_nossl.cc @@ -29,11 +29,11 @@ class AesEncryptor::AesEncryptorImpl {}; AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(::arrow::util::span footer, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span nonce, - ::arrow::util::span encrypted_footer) { +int32_t AesEncryptor::SignedFooterEncrypt(::arrow::util::span footer, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span nonce, + ::arrow::util::span encrypted_footer) { ThrowOpenSSLRequiredException(); return -1; } @@ -45,25 +45,25 @@ int32_t AesEncryptor::CiphertextLength(int64_t plaintext_len) const { return -1; } -int AesEncryptor::Encrypt(::arrow::util::span plaintext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span ciphertext) { +int32_t AesEncryptor::Encrypt(::arrow::util::span plaintext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span ciphertext) { ThrowOpenSSLRequiredException(); return -1; } -AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length) { ThrowOpenSSLRequiredException(); } class AesDecryptor::AesDecryptorImpl {}; -int AesDecryptor::Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span plaintext) { +int32_t AesDecryptor::Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span plaintext) { ThrowOpenSSLRequiredException(); return -1; } @@ -72,36 +72,37 @@ void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); } AesDecryptor::~AesDecryptor() {} -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata) { ThrowOpenSSLRequiredException(); return NULLPTR; } -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata, bool write_length) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool write_length) { ThrowOpenSSLRequiredException(); return NULLPTR; } -AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length) { ThrowOpenSSLRequiredException(); } std::shared_ptr AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, + ParquetCipher::type alg_id, int32_t key_len, bool metadata, std::vector>* all_decryptors) { ThrowOpenSSLRequiredException(); return NULLPTR; } -int AesDecryptor::PlaintextLength(int ciphertext_len) const { +int32_t AesDecryptor::PlaintextLength(int32_t ciphertext_len) const { ThrowOpenSSLRequiredException(); return -1; } -int AesDecryptor::CiphertextLength(int plaintext_len) const { +int32_t AesDecryptor::CiphertextLength(int32_t plaintext_len) const { ThrowOpenSSLRequiredException(); return -1; } @@ -122,7 +123,7 @@ void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD) { ThrowOpenSSLRequiredException(); } -void RandBytes(unsigned char* buf, int num) { 
ThrowOpenSSLRequiredException(); } +void RandBytes(unsigned char* buf, size_t num) { ThrowOpenSSLRequiredException(); } void EnsureBackendInitialized() {} diff --git a/cpp/src/parquet/encryption/encryption_internal_test.cc b/cpp/src/parquet/encryption/encryption_internal_test.cc index 22e14663ea8..bf6607e3287 100644 --- a/cpp/src/parquet/encryption/encryption_internal_test.cc +++ b/cpp/src/parquet/encryption/encryption_internal_test.cc @@ -41,22 +41,22 @@ class TestAesEncryption : public ::testing::Test { encryptor.CiphertextLength(static_cast(plain_text_.size())); std::vector ciphertext(expected_ciphertext_len, '\0'); - int ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), - str2span(aad_), ciphertext); + int32_t ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), + str2span(aad_), ciphertext); ASSERT_EQ(ciphertext_length, expected_ciphertext_len); AesDecryptor decryptor(cipher_type, key_length_, metadata, write_length); - int expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); + int32_t expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); std::vector decrypted_text(expected_plaintext_length, '\0'); - int plaintext_length = + int32_t plaintext_length = decryptor.Decrypt(ciphertext, str2span(key_), str2span(aad_), decrypted_text); std::string decrypted_text_str(decrypted_text.begin(), decrypted_text.end()); - ASSERT_EQ(plaintext_length, static_cast(plain_text_.size())); + ASSERT_EQ(plaintext_length, static_cast(plain_text_.size())); ASSERT_EQ(plaintext_length, expected_plaintext_length); ASSERT_EQ(decrypted_text_str, plain_text_); } @@ -68,10 +68,10 @@ class TestAesEncryption : public ::testing::Test { AesDecryptor decryptor(cipher_type, key_length_, metadata, write_length); // Create ciphertext of all zeros, so the ciphertext length will be read as zero - const int ciphertext_length = 100; + constexpr int32_t ciphertext_length = 100; std::vector ciphertext(ciphertext_length, '\0'); - int expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); + int32_t expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); std::vector decrypted_text(expected_plaintext_length, '\0'); EXPECT_THROW( @@ -89,12 +89,12 @@ class TestAesEncryption : public ::testing::Test { encryptor.CiphertextLength(static_cast(plain_text_.size())); std::vector ciphertext(expected_ciphertext_len, '\0'); - int ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), - str2span(aad_), ciphertext); + int32_t ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), + str2span(aad_), ciphertext); AesDecryptor decryptor(cipher_type, key_length_, metadata, write_length); - int expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); + int32_t expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); std::vector decrypted_text(expected_plaintext_length, '\0'); ::arrow::util::span truncated_ciphertext(ciphertext.data(), @@ -105,7 +105,7 @@ class TestAesEncryption : public ::testing::Test { } private: - int key_length_ = 0; + int32_t key_length_ = 0; std::string key_; std::string aad_; std::string plain_text_; diff --git a/cpp/src/parquet/encryption/file_key_wrapper.cc b/cpp/src/parquet/encryption/file_key_wrapper.cc index 032ae45821a..8ce563e60d7 100644 --- a/cpp/src/parquet/encryption/file_key_wrapper.cc +++ b/cpp/src/parquet/encryption/file_key_wrapper.cc @@ -112,10 +112,10 @@ std::string FileKeyWrapper::GetEncryptionKeyMetadata(const 
std::string& data_key KeyEncryptionKey FileKeyWrapper::CreateKeyEncryptionKey( const std::string& master_key_id) { std::string kek_bytes(kKeyEncryptionKeyLength, '\0'); - RandBytes(reinterpret_cast(&kek_bytes[0]), kKeyEncryptionKeyLength); + RandBytes(reinterpret_cast(kek_bytes.data()), kKeyEncryptionKeyLength); std::string kek_id(kKeyEncryptionKeyIdLength, '\0'); - RandBytes(reinterpret_cast(&kek_id[0]), kKeyEncryptionKeyIdLength); + RandBytes(reinterpret_cast(kek_id.data()), kKeyEncryptionKeyIdLength); // Encrypt KEK with Master key std::string encoded_wrapped_kek = kms_client_->WrapKey(kek_bytes, master_key_id); diff --git a/cpp/src/parquet/encryption/internal_file_decryptor.cc b/cpp/src/parquet/encryption/internal_file_decryptor.cc index fae5ce1f7a8..53a2f8c0216 100644 --- a/cpp/src/parquet/encryption/internal_file_decryptor.cc +++ b/cpp/src/parquet/encryption/internal_file_decryptor.cc @@ -33,16 +33,16 @@ Decryptor::Decryptor(std::shared_ptr aes_decryptor, aad_(aad), pool_(pool) {} -int Decryptor::PlaintextLength(int ciphertext_len) const { +int32_t Decryptor::PlaintextLength(int32_t ciphertext_len) const { return aes_decryptor_->PlaintextLength(ciphertext_len); } -int Decryptor::CiphertextLength(int plaintext_len) const { +int32_t Decryptor::CiphertextLength(int32_t plaintext_len) const { return aes_decryptor_->CiphertextLength(plaintext_len); } -int Decryptor::Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span plaintext) { +int32_t Decryptor::Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span plaintext) { return aes_decryptor_->Decrypt(ciphertext, str2span(key_), str2span(aad_), plaintext); } @@ -143,7 +143,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( // Create both data and metadata decryptors to avoid redundant retrieval of key // from the key_retriever. 
- int key_len = static_cast(footer_key.size()); + auto key_len = static_cast(footer_key.size()); std::shared_ptr aes_metadata_decryptor; std::shared_ptr aes_data_decryptor; @@ -197,7 +197,7 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( throw HiddenColumnException("HiddenColumnException, path=" + column_path); } - int key_len = static_cast(column_key.size()); + auto key_len = static_cast(column_key.size()); std::lock_guard lock(mutex_); auto aes_decryptor = encryption::AesDecryptor::Make(algorithm_, key_len, metadata, &all_decryptors_); diff --git a/cpp/src/parquet/encryption/internal_file_decryptor.h b/cpp/src/parquet/encryption/internal_file_decryptor.h index 8af3587acf8..08423de7fe9 100644 --- a/cpp/src/parquet/encryption/internal_file_decryptor.h +++ b/cpp/src/parquet/encryption/internal_file_decryptor.h @@ -45,10 +45,10 @@ class PARQUET_EXPORT Decryptor { void UpdateAad(const std::string& aad) { aad_ = aad; } ::arrow::MemoryPool* pool() { return pool_; } - [[nodiscard]] int PlaintextLength(int ciphertext_len) const; - [[nodiscard]] int CiphertextLength(int plaintext_len) const; - int Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span plaintext); + [[nodiscard]] int32_t PlaintextLength(int32_t ciphertext_len) const; + [[nodiscard]] int32_t CiphertextLength(int32_t plaintext_len) const; + int32_t Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span plaintext); private: std::shared_ptr aes_decryptor_; diff --git a/cpp/src/parquet/encryption/internal_file_encryptor.cc b/cpp/src/parquet/encryption/internal_file_encryptor.cc index 285c2100be8..94094e6aca2 100644 --- a/cpp/src/parquet/encryption/internal_file_encryptor.cc +++ b/cpp/src/parquet/encryption/internal_file_encryptor.cc @@ -35,8 +35,8 @@ int32_t Encryptor::CiphertextLength(int64_t plaintext_len) const { return aes_encryptor_->CiphertextLength(plaintext_len); } -int Encryptor::Encrypt(::arrow::util::span plaintext, - ::arrow::util::span ciphertext) { +int32_t Encryptor::Encrypt(::arrow::util::span plaintext, + ::arrow::util::span ciphertext) { return aes_encryptor_->Encrypt(plaintext, str2span(key_), str2span(aad_), ciphertext); } @@ -143,7 +143,7 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( return encryptor; } -int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) const { +int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int32_t key_len) const { if (key_len == 16) return 0; else if (key_len == 24) @@ -155,7 +155,7 @@ int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) const { encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { - int key_len = static_cast(key_size); + auto key_len = static_cast(key_size); int index = MapKeyLenToEncryptorArrayIndex(key_len); if (meta_encryptor_[index] == nullptr) { meta_encryptor_[index] = encryption::AesEncryptor::Make(algorithm, key_len, true); @@ -165,7 +165,7 @@ encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { - int key_len = static_cast(key_size); + auto key_len = static_cast(key_size); int index = MapKeyLenToEncryptorArrayIndex(key_len); if (data_encryptor_[index] == nullptr) { data_encryptor_[index] = encryption::AesEncryptor::Make(algorithm, key_len, false); diff --git a/cpp/src/parquet/encryption/internal_file_encryptor.h b/cpp/src/parquet/encryption/internal_file_encryptor.h index 
91b6e9fe5aa..5a3d743ce53 100644 --- a/cpp/src/parquet/encryption/internal_file_encryptor.h +++ b/cpp/src/parquet/encryption/internal_file_encryptor.h @@ -45,8 +45,8 @@ class PARQUET_EXPORT Encryptor { [[nodiscard]] int32_t CiphertextLength(int64_t plaintext_len) const; - int Encrypt(::arrow::util::span plaintext, - ::arrow::util::span ciphertext); + int32_t Encrypt(::arrow::util::span plaintext, + ::arrow::util::span ciphertext); bool EncryptColumnMetaData( bool encrypted_footer, @@ -103,7 +103,7 @@ class InternalFileEncryptor { encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, size_t key_len); - int MapKeyLenToEncryptorArrayIndex(int key_len) const; + int MapKeyLenToEncryptorArrayIndex(int32_t key_len) const; }; } // namespace parquet diff --git a/cpp/src/parquet/encryption/key_toolkit_internal.cc b/cpp/src/parquet/encryption/key_toolkit_internal.cc index 5d7925aa031..89a52a2bcd6 100644 --- a/cpp/src/parquet/encryption/key_toolkit_internal.cc +++ b/cpp/src/parquet/encryption/key_toolkit_internal.cc @@ -53,7 +53,7 @@ std::string DecryptKeyLocally(const std::string& encoded_encrypted_key, static_cast(master_key.size()), false, false /*contains_length*/); - int decrypted_key_len = + int32_t decrypted_key_len = key_decryptor.PlaintextLength(static_cast(encrypted_key.size())); std::string decrypted_key(decrypted_key_len, '\0'); ::arrow::util::span decrypted_key_span( diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 4f2aa6e3732..423154f8641 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -751,7 +751,7 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr encrypted_buffer = AllocateBuffer( file_decryptor_->pool(), aes_encryptor->CiphertextLength(serialized_len)); - uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( + int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( serialized_data_span, str2span(key), str2span(aad), nonce, encrypted_buffer->mutable_span_as()); // Delete AES encryptor object. It was created only to verify the footer signature. 
@@ -799,7 +799,7 @@ class FileMetaData::FileMetaDataImpl { // encrypt the footer key std::vector encrypted_data(encryptor->CiphertextLength(serialized_len)); - int encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); + int32_t encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); // write unencrypted footer PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len)); @@ -1672,7 +1672,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { serialized_len); std::vector encrypted_data(encryptor->CiphertextLength(serialized_len)); - int encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); + int32_t encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index b21b0e07afb..e7bfd434c81 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -530,7 +530,7 @@ class ThriftSerializer { auto cipher_buffer = AllocateBuffer(encryptor->pool(), encryptor->CiphertextLength(out_length)); ::arrow::util::span out_span(out_buffer, out_length); - int cipher_buffer_len = + int32_t cipher_buffer_len = encryptor->Encrypt(out_span, cipher_buffer->mutable_span_as()); PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); From 6a1d0520974355a749557c993841732d4fcf894c Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Wed, 21 Aug 2024 18:12:45 -0700 Subject: [PATCH 054/157] GH-43717: [Java][FlightSQL] Add all ActionTypes to FlightSqlUtils.FLIGHT_SQL_ACTIONS (#43718) This adds all of the FlightSQL ActionTypes to FlightSqlUtils.FLIGHT_SQL_ACTIONS * GitHub Issue: #43717 Authored-by: Devin Smith Signed-off-by: David Li --- .../org/apache/arrow/flight/sql/FlightSqlUtils.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java index 9bb95047691..9e13e57d66c 100644 --- a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java @@ -82,7 +82,15 @@ public final class FlightSqlUtils { + "Response Message: N/A"); public static final List FLIGHT_SQL_ACTIONS = - ImmutableList.of(FLIGHT_SQL_CREATE_PREPARED_STATEMENT, FLIGHT_SQL_CLOSE_PREPARED_STATEMENT); + ImmutableList.of( + FLIGHT_SQL_BEGIN_SAVEPOINT, + FLIGHT_SQL_BEGIN_TRANSACTION, + FLIGHT_SQL_CREATE_PREPARED_STATEMENT, + FLIGHT_SQL_CLOSE_PREPARED_STATEMENT, + FLIGHT_SQL_CREATE_PREPARED_SUBSTRAIT_PLAN, + FLIGHT_SQL_CANCEL_QUERY, + FLIGHT_SQL_END_SAVEPOINT, + FLIGHT_SQL_END_TRANSACTION); /** * Helper to parse {@link com.google.protobuf.Any} objects to the specific protobuf object. From 2e83aa63d95a6fa380efdd5e5cb720a3154f9c93 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Aug 2024 09:57:02 +0200 Subject: [PATCH 055/157] GH-43690: [Python][CI] Simplify python/requirements-wheel-test.txt file (#43691) ### Rationale for this change The current [requirements-wheel-test.txt](https://github.com/apache/arrow/blob/7c8909a144f2e8d593dc8ad363ac95b2865b04ca/python/requirements-wheel-test.txt) file has quite complex and detailed version pinning, varying per architecture. 
I think this can be simplified because we just want to test with some older version of numpy and pandas (and the exact version isn't that important). * GitHub Issue: #43690 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/requirements-wheel-test.txt | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index 46bedc13ba1..c7ff63e3395 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -5,22 +5,12 @@ pytest pytz tzdata; sys_platform == 'win32' -numpy==1.21.3; platform_system == "Linux" and platform_machine == "aarch64" and python_version < "3.11" -numpy==1.23.4; python_version == "3.11" -numpy==1.26.0; python_version >= "3.12" -numpy==1.19.5; platform_system == "Linux" and platform_machine != "aarch64" and python_version < "3.9" -numpy==1.21.3; platform_system == "Linux" and platform_machine != "aarch64" and python_version >= "3.9" and python_version < "3.11" -numpy==1.21.3; platform_system == "Darwin" and platform_machine == "arm64" and python_version < "3.11" -numpy==1.19.5; platform_system == "Darwin" and platform_machine != "arm64" and python_version < "3.9" -numpy==1.21.3; platform_system == "Darwin" and platform_machine != "arm64" and python_version >= "3.9" and python_version < "3.11" -numpy==1.19.5; platform_system == "Windows" and python_version < "3.9" -numpy==1.21.3; platform_system == "Windows" and python_version >= "3.9" and python_version < "3.11" +# We generally test with the oldest numpy version that supports a given Python +# version. However, there is no need to make this strictly the oldest version, +# so it can be broadened to have a single version specification across platforms. +# (`~=x.y.z` specifies a compatible release as `>=x.y.z, == x.y.*`) +numpy~=1.21.3; python_version < "3.11" +numpy~=1.23.2; python_version == "3.11" +numpy~=1.26.0; python_version == "3.12" -pandas<1.1.0; platform_system == "Linux" and platform_machine != "aarch64" and python_version < "3.8" -pandas; platform_system == "Linux" and platform_machine != "aarch64" and python_version >= "3.8" -pandas; platform_system == "Linux" and platform_machine == "aarch64" -pandas<1.1.0; platform_system == "Darwin" and platform_machine != "arm64" and python_version < "3.8" -pandas; platform_system == "Darwin" and platform_machine != "arm64" and python_version >= "3.8" -pandas; platform_system == "Darwin" and platform_machine == "arm64" -pandas<1.1.0; platform_system == "Windows" and python_version < "3.8" -pandas; platform_system == "Windows" and python_version >= "3.8" +pandas From fc54eadb72791288fc9681bbcc6c8a9d8d6fff1d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Aug 2024 11:28:01 +0200 Subject: [PATCH 056/157] GH-43785: [Python][CI] Correct PARQUET_TEST_DATA path in wheel tests (#43786) ### Rationale for this change Starting with https://github.com/apache/arrow/pull/41580, the pyarrow tests now also rely on a file in the parquet-testing submodule. And the path to that directory is controlled by `PARQUET_TEST_DATA`, which appears to be set wrongly in the wheel test scripts, causing all wheel builds to fail at the moment. 
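For reference, a minimal sketch of how a test helper might resolve that variable (the helper name and the fallback path are illustrative assumptions, not the actual pyarrow test code; the corrected checkout-relative path matches the one used in the scripts below):

```python
import os
import pathlib


def parquet_test_data_dir() -> pathlib.Path:
    # Illustrative only: prefer the PARQUET_TEST_DATA environment variable,
    # which the wheel test scripts are expected to export.
    env = os.environ.get("PARQUET_TEST_DATA")
    # Assumed fallback relative to an Arrow source checkout; the submodule
    # lives under cpp/, not at the repository root.
    path = pathlib.Path(env) if env else pathlib.Path("cpp/submodules/parquet-testing/data")
    if not path.is_dir():
        raise FileNotFoundError(f"parquet-testing data not found at {path}")
    return path
```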
* GitHub Issue: #43785 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/scripts/python_wheel_unix_test.sh | 2 +- ci/scripts/python_wheel_windows_test.bat | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index a25e5c51bdd..cf87a170567 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -54,7 +54,7 @@ export PYARROW_TEST_S3=${ARROW_S3} export PYARROW_TEST_TENSORFLOW=ON export ARROW_TEST_DATA=${source_dir}/testing/data -export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data +export PARQUET_TEST_DATA=${source_dir}/cpp/submodules/parquet-testing/data if [ "${INSTALL_PYARROW}" == "ON" ]; then # Install the built wheels diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index a928c3571d0..87c0bb12520 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -35,7 +35,7 @@ set PYARROW_TEST_TENSORFLOW=ON @REM set PYARROW_TEST_PANDAS=ON set ARROW_TEST_DATA=C:\arrow\testing\data -set PARQUET_TEST_DATA=C:\arrow\submodules\parquet-testing\data +set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data @REM Install testing dependencies pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 From b4f7efe5bdc2218bb595b130b4f65237caecfa76 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 22 Aug 2024 14:45:00 +0200 Subject: [PATCH 057/157] GH-43787: [C++] Register the new Opaque extension type by default (#43788) This is to resolve #43787 > The Opaque extension type implementation for C++ (plus python bindings) was added in https://github.com/apache/arrow/pull/43458, but it was not registered by default, which we should do for canonical extension types (see https://github.com/apache/arrow/pull/43458#issuecomment-2302551404) Additionally, this adds `bool8` extension type builds with `ARROW_JSON=false` as discussed [here](https://github.com/apache/arrow/commit/525881987d0b9b4f464c3e3593a9a7b4e3c767d0#r145613657) ### Rationale for this change Canonical types should be registered by default if possible (except e.g. if they can't be compiled due to `ARROW_JSON=false`). ### What changes are included in this PR? This adds default registration for `opaque`, changes when `bool8` is built and moves all canonical tests under the same test target. ### Are these changes tested? Changes are tested by previously existing tests. ### Are there any user-facing changes? `opaue` will now be registered by default and `bool8` will be present in case `ARROW_JSON=false` at build time. 
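To illustrate the user-facing effect, here is a small sketch (assuming the `pa.opaque()` factory exposed by the Python bindings; this is not the test code in this PR) of an `arrow.opaque` column surviving an IPC round trip with no manual registration step:

```python
import pyarrow as pa

# Assumption: pa.opaque(storage_type, type_name, vendor_name) is the Python
# binding for the canonical Opaque extension type.
opaque_type = pa.opaque(pa.binary(), type_name="geometry", vendor_name="adbc.postgresql")
arr = pa.ExtensionArray.from_storage(opaque_type, pa.array([b"foobar", None], pa.binary()))
batch = pa.record_batch([arr], names=["ext"])

# IPC round trip through an in-memory stream; no registered_extension_type()
# context manager should be needed once the type is registered by default.
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)
with pa.ipc.open_stream(sink.getvalue()) as reader:
    roundtripped = reader.read_next_batch()

assert roundtripped.column(0).type.extension_name == "arrow.opaque"
```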
* GitHub Issue: #43787 Authored-by: Rok Mihevc Signed-off-by: Rok Mihevc --- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/extension/CMakeLists.txt | 18 ++++++----------- cpp/src/arrow/extension/bool8.h | 2 ++ cpp/src/arrow/extension/bool8_test.cc | 1 - cpp/src/arrow/extension/fixed_shape_tensor.h | 2 ++ cpp/src/arrow/extension/opaque.h | 2 ++ cpp/src/arrow/extension/opaque_test.cc | 2 -- cpp/src/arrow/extension_type.cc | 21 ++++++++++++-------- python/pyarrow/tests/test_extension_type.py | 5 ++--- 9 files changed, 28 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb7253b6fd6..89f28ee416e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -374,6 +374,7 @@ set(ARROW_SRCS datum.cc device.cc extension_type.cc + extension/bool8.cc pretty_print.cc record_batch.cc result.cc @@ -906,7 +907,6 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON - extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index fcd5fa529ab..5cb4bc77af2 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,22 +15,16 @@ # specific language governing permissions and limitations # under the License. -add_arrow_test(test - SOURCES - bool8_test.cc - PREFIX - "arrow-extension-bool8") +set(CANONICAL_EXTENSION_TESTS bool8_test.cc) -add_arrow_test(test - SOURCES - fixed_shape_tensor_test.cc - PREFIX - "arrow-fixed-shape-tensor") +if(ARROW_JSON) + list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) +endif() add_arrow_test(test SOURCES - opaque_test.cc + ${CANONICAL_EXTENSION_TESTS} PREFIX - "arrow-extension-opaque") + "arrow-canonical-extensions") arrow_install_all_headers("arrow/extension") diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h index 02e629b28a8..fbb507639e2 100644 --- a/cpp/src/arrow/extension/bool8.h +++ b/cpp/src/arrow/extension/bool8.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include "arrow/extension_type.h" namespace arrow::extension { diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc index eabcfcf62d3..ee77332bc32 100644 --- a/cpp/src/arrow/extension/bool8_test.cc +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -19,7 +19,6 @@ #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" -#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" namespace arrow { diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 20ec20a64c2..80a602021c6 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include "arrow/extension_type.h" namespace arrow { diff --git a/cpp/src/arrow/extension/opaque.h b/cpp/src/arrow/extension/opaque.h index 9814b391cba..5d3411798f8 100644 --- a/cpp/src/arrow/extension/opaque.h +++ b/cpp/src/arrow/extension/opaque.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+#pragma once + #include "arrow/extension_type.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/extension/opaque_test.cc b/cpp/src/arrow/extension/opaque_test.cc index 1629cdb3965..16fcba3fa6b 100644 --- a/cpp/src/arrow/extension/opaque_test.cc +++ b/cpp/src/arrow/extension/opaque_test.cc @@ -25,7 +25,6 @@ #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" -#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/type_fwd.h" #include "arrow/util/checked_cast.h" @@ -169,7 +168,6 @@ TEST(OpaqueType, MetadataRoundTrip) { TEST(OpaqueType, BatchRoundTrip) { auto type = internal::checked_pointer_cast( extension::opaque(binary(), "geometry", "adbc.postgresql")); - ExtensionTypeGuard guard(type); auto storage = ArrayFromJSON(binary(), R"(["foobar", null])"); auto array = ExtensionType::WrapArray(type, storage); diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 685018f7de7..83c7ebed4f3 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -27,9 +27,10 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" #include "arrow/config.h" -#ifdef ARROW_JSON #include "arrow/extension/bool8.h" +#ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/opaque.h" #endif #include "arrow/status.h" #include "arrow/type.h" @@ -143,17 +144,21 @@ static std::once_flag registry_initialized; namespace internal { static void CreateGlobalRegistry() { + // Register canonical extension types + g_registry = std::make_shared(); + std::vector> ext_types{extension::bool8()}; #ifdef ARROW_JSON - // Register canonical extension types - auto fst_ext_type = - checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); - ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type)); - - auto bool8_ext_type = checked_pointer_cast(extension::bool8()); - ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type)); + ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); + ext_types.push_back(extension::opaque(null(), "", "")); #endif + + // Register canonical extension types + for (const auto& ext_type : ext_types) { + ARROW_CHECK_OK( + g_registry->RegisterType(checked_pointer_cast(ext_type))); + } } } // namespace internal diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index b04ee85ec99..0d50c467e96 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1693,9 +1693,8 @@ def test_opaque_type(pickle_module, storage_type, storage): arr = pa.ExtensionArray.from_storage(opaque_type, storage) assert isinstance(arr, opaque_arr_class) - with registered_extension_type(opaque_type): - buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) - batch = ipc_read_batch(buf) + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) assert batch.column(0).type.extension_name == "arrow.opaque" assert isinstance(batch.column(0), opaque_arr_class) From 3e9384bbf4162ea060e867a753bce464b31e5e1c Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 22 Aug 2024 15:27:40 +0200 Subject: [PATCH 058/157] GH-43519: [Python] Set up wheel building for Python 3.13 (#43539) ### Rationale for this change Like #43519 mentions, now that the first `rc` is out, it's probably time to add CI coverage for Python 3.13 (and also start building wheels). ### What changes are included in this PR?
I'm fairly new to the build/CI processes of the project, but I tried to follow the same template as #37901. I'll follow up afterwards with adding CI coverage for the free-threaded build as well. * GitHub Issue: #43519 Lead-authored-by: Lysandros Nikolaou Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .env | 2 +- ci/docker/python-wheel-manylinux-test.dockerfile | 7 ++++--- ci/docker/python-wheel-manylinux.dockerfile | 2 +- .../python-wheel-windows-test-vs2019.dockerfile | 7 ++++--- ci/docker/python-wheel-windows-vs2019.dockerfile | 7 ++++--- ci/scripts/install_gcs_testbench.sh | 10 +++++++--- ci/scripts/install_python.sh | 14 +++++++++++--- ci/scripts/python_wheel_macos_build.sh | 2 -- dev/release/verify-release-candidate.sh | 6 +++--- dev/tasks/python-wheels/github.linux.yml | 5 +++++ dev/tasks/python-wheels/github.osx.yml | 2 +- dev/tasks/tasks.yml | 3 ++- docker-compose.yml | 9 ++++++--- python/pyproject.toml | 1 + python/requirements-wheel-build.txt | 5 +++++ python/requirements-wheel-test.txt | 7 +++++++ 16 files changed, 62 insertions(+), 27 deletions(-) diff --git a/.env b/.env index 1358aafe824..21f904c3208 100644 --- a/.env +++ b/.env @@ -95,7 +95,7 @@ VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release # ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-06-18 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-08-06 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker-compose run --rm conan". # See https://github.com/conan-io/conan-docker-tools#readme and diff --git a/ci/docker/python-wheel-manylinux-test.dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile index cdd0ae3ced7..443ff9c53cb 100644 --- a/ci/docker/python-wheel-manylinux-test.dockerfile +++ b/ci/docker/python-wheel-manylinux-test.dockerfile @@ -16,8 +16,8 @@ # under the License. 
ARG arch -ARG python -FROM ${arch}/python:${python} +ARG python_image_tag +FROM ${arch}/python:${python_image_tag} # RUN pip install --upgrade pip @@ -27,4 +27,5 @@ COPY python/requirements-wheel-test.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-test.txt COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN PYTHON=python /arrow/ci/scripts/install_gcs_testbench.sh default +ARG python +RUN PYTHON_VERSION=${python} /arrow/ci/scripts/install_gcs_testbench.sh default diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index cb39667af1e..42f088fd8a2 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -103,7 +103,7 @@ RUN vcpkg install \ # Configure Python for applications running in the bash shell of this Dockerfile ARG python=3.8 ENV PYTHON_VERSION=${python} -RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-*) && \ +RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}) && \ echo "export PATH=$PYTHON_ROOT/bin:\$PATH" >> /etc/profile.d/python.sh SHELL ["/bin/bash", "-i", "-c"] diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 32bbb55e826..5f488a4c285 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -40,10 +40,11 @@ ARG python=3.8 RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") # Install archiver to extract xz archives -RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% & \ +RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% & \ python -m pip install --no-cache-dir -U pip setuptools & \ choco install --no-progress -r -y archiver diff --git a/ci/docker/python-wheel-windows-vs2019.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile index ff42de939d9..5a17e3e4c52 100644 --- a/ci/docker/python-wheel-windows-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -83,9 +83,10 @@ ARG python=3.8 RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH 
"%PATH%;C:\Python312;C:\Python312\Scripts") -RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") +RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 2090290c993..5471b3cc238 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -41,8 +41,12 @@ version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.39.0" # Latests versions of Testbench require newer setuptools - ${PYTHON:-python3} -m pip install --upgrade setuptools + python3 -m pip install --upgrade setuptools fi -${PYTHON:-python3} -m pip install \ - "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" +# This script is run with PYTHON undefined in some places, +# but those only use older pythons. +if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then + python3 -m pip install \ + "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" +fi diff --git a/ci/scripts/install_python.sh b/ci/scripts/install_python.sh index 5f962f02b91..42d0e9ca179 100755 --- a/ci/scripts/install_python.sh +++ b/ci/scripts/install_python.sh @@ -28,8 +28,9 @@ declare -A versions versions=([3.8]=3.8.10 [3.9]=3.9.13 [3.10]=3.10.11 - [3.11]=3.11.5 - [3.12]=3.12.0) + [3.11]=3.11.9 + [3.12]=3.12.4 + [3.13]=3.13.0) if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -46,7 +47,14 @@ full_version=${versions[$2]} if [ $platform = "macOS" ]; then echo "Downloading Python installer..." 
- if [ "$(uname -m)" = "arm64" ] || [ "$version" = "3.10" ] || [ "$version" = "3.11" ] || [ "$version" = "3.12" ]; then + if [ "$version" = "3.13" ]; + then + fname="python-${full_version}rc1-macos11.pkg" + elif [ "$(uname -m)" = "arm64" ] || \ + [ "$version" = "3.10" ] || \ + [ "$version" = "3.11" ] || \ + [ "$version" = "3.12" ]; + then fname="python-${full_version}-macos11.pkg" else fname="python-${full_version}-macosx10.9.pkg" diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 3ed9d5d8dd1..d5430f26748 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -48,13 +48,11 @@ fi echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') -export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" pip install \ --upgrade \ --only-binary=:all: \ --target $PIP_SITE_PACKAGES \ - --platform $PIP_TARGET_PLATFORM \ -r ${source_dir}/python/requirements-wheel-build.txt pip install "delocate>=0.10.3" diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 6a36109dc2f..07e765a759e 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -1146,7 +1146,7 @@ test_linux_wheels() { local arch="x86_64" fi - local python_versions="${TEST_PYTHON_VERSIONS:-3.8 3.9 3.10 3.11 3.12}" + local python_versions="${TEST_PYTHON_VERSIONS:-3.8 3.9 3.10 3.11 3.12 3.13}" local platform_tags="${TEST_WHEEL_PLATFORM_TAGS:-manylinux_2_17_${arch}.manylinux2014_${arch} manylinux_2_28_${arch}}" for python in ${python_versions}; do @@ -1170,11 +1170,11 @@ test_macos_wheels() { # apple silicon processor if [ "$(uname -m)" = "arm64" ]; then - local python_versions="3.8 3.9 3.10 3.11 3.12" + local python_versions="3.8 3.9 3.10 3.11 3.12 3.13" local platform_tags="macosx_11_0_arm64" local check_flight=OFF else - local python_versions="3.8 3.9 3.10 3.11 3.12" + local python_versions="3.8 3.9 3.10 3.11 3.12 3.13" local platform_tags="macosx_10_15_x86_64" fi diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index 968c5da2189..2854d4349fb 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -36,6 +36,11 @@ jobs: ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 {% endif %} PYTHON: "{{ python_version }}" + {% if python_version == "3.13" %} + PYTHON_IMAGE_TAG: "3.13-rc" + {% else %} + PYTHON_IMAGE_TAG: "{{ python_version }}" + {% endif %} steps: {{ macros.github_checkout_arrow()|indent }} diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index 8ceb468af89..b26aeba32b7 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ b/dev/tasks/python-wheels/github.osx.yml @@ -121,7 +121,7 @@ jobs: source test-env/bin/activate pip install --upgrade pip wheel arch -{{ arch }} pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arch -{{ arch }} arrow/ci/scripts/install_gcs_testbench.sh default + PYTHON_VERSION={{ python_version }} arch -{{ arch }} arrow/ci/scripts/install_gcs_testbench.sh default arch -{{ arch }} arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {{ macros.github_upload_releases("arrow/python/repaired_wheels/*.whl")|indent }} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index fe02fe9ce68..60114d69308 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -389,7 +389,8 @@ tasks: 
("3.9", "cp39", "cp39"), ("3.10", "cp310", "cp310"), ("3.11", "cp311", "cp311"), - ("3.12", "cp312", "cp312")] %} + ("3.12", "cp312", "cp312"), + ("3.13", "cp313", "cp313")] %} {############################## Wheel Linux ##################################} diff --git a/docker-compose.yml b/docker-compose.yml index 14eeeeee6e5..3045cf015bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1096,9 +1096,10 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2024-02-04-ea37246 + base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2024-08-03-32dfa47 vcpkg: ${VCPKG} python: ${PYTHON} + python_image_tag: ${PYTHON_IMAGE_TAG} manylinux: 2014 context: . dockerfile: ci/docker/python-wheel-manylinux.dockerfile @@ -1119,9 +1120,10 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2024-02-04-ea37246 + base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2024-08-03-32dfa47 vcpkg: ${VCPKG} python: ${PYTHON} + python_image_tag: ${PYTHON_IMAGE_TAG} manylinux: 2_28 context: . dockerfile: ci/docker/python-wheel-manylinux.dockerfile @@ -1135,7 +1137,7 @@ services: command: /arrow/ci/scripts/python_wheel_manylinux_build.sh python-wheel-manylinux-test-imports: - image: ${ARCH}/python:${PYTHON} + image: ${ARCH}/python:${PYTHON_IMAGE_TAG} shm_size: 2G volumes: - .:/arrow:delegated @@ -1151,6 +1153,7 @@ services: args: arch: ${ARCH} python: ${PYTHON} + python_image_tag: ${PYTHON_IMAGE_TAG} context: . dockerfile: ci/docker/python-wheel-manylinux-test.dockerfile cache_from: diff --git a/python/pyproject.toml b/python/pyproject.toml index d863bb3e5f0..8ece65dd467 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -48,6 +48,7 @@ classifiers = [ 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ] maintainers = [ {name = "Apache Arrow Developers", email = "dev@arrow.apache.org"} diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index faa078d3d7f..2d448004768 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,3 +1,8 @@ +# Remove pre and extra index url once there's NumPy and Cython wheels for 3.13 +# on PyPI +--pre +--extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + cython>=0.29.31 oldest-supported-numpy>=0.14; python_version<'3.9' numpy>=2.0.0; python_version>='3.9' diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index c7ff63e3395..98ec2bd4fd4 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -1,3 +1,9 @@ +# Remove pre and extra index url once there's NumPy and Cython wheels for 3.13 +# on PyPI +--pre +--prefer-binary +--extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + cffi cython hypothesis @@ -12,5 +18,6 @@ tzdata; sys_platform == 'win32' numpy~=1.21.3; python_version < "3.11" numpy~=1.23.2; python_version == "3.11" numpy~=1.26.0; python_version == "3.12" +numpy~=2.1.0; python_version >= "3.13" pandas From 88d57cf41fde20adf14adca02e02d2cb92c83443 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 22 Aug 2024 08:45:19 -0500 Subject: [PATCH 059/157] MINOR: [CI][R] Undo #43636 now that the action is approved (#43730) Undo the pinning in #43636 now that INFRA has approved the quarto-dev action Authored-by: Jonathan Keane Signed-off-by: 
Antoine Pitrou --- .github/workflows/r.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index bf7eb99e7e9..2820d42470b 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -86,19 +86,18 @@ jobs: run: | sudo apt-get install devscripts - # replace the SHA with v2 once INFRA-26031 is resolved - - uses: r-lib/actions/setup-r@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true install-r: false - - uses: r-lib/actions/setup-r-dependencies@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck needs: check working-directory: src/r - - uses: r-lib/actions/check-r-package@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/check-r-package@v2 with: working-directory: src/r env: @@ -341,11 +340,11 @@ jobs: cd r/windows ls *.zip | xargs -n 1 unzip -uo rm -rf *.zip - - uses: r-lib/actions/setup-r@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.rversion }} Ncpus: 2 - - uses: r-lib/actions/setup-r-dependencies@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r-dependencies@v2 env: GITHUB_PAT: "${{ github.token }}" with: From 2e33e98f583035cd686455870e9cbf5fb6dc9966 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 22 Aug 2024 08:26:37 -0800 Subject: [PATCH 060/157] MINOR: [GO] fixup test case name in cast_test.go (#43780) --- go/arrow/compute/cast_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go index 2e748a2fee9..fa08467dd39 100644 --- a/go/arrow/compute/cast_test.go +++ b/go/arrow/compute/cast_test.go @@ -2636,7 +2636,7 @@ func (c *CastSuite) TestStructToDifferentNullabilityStruct() { defer dest3Nullable.Release() checkCast(c.T(), srcNonNull, dest3Nullable, *compute.DefaultCastOptions(true)) }) - c.Run("non-nullable to nullable", func() { + c.Run("nullable to non-nullable", func() { fieldsSrcNullable := []arrow.Field{ {Name: "a", Type: arrow.PrimitiveTypes.Int8, Nullable: true}, {Name: "b", Type: arrow.PrimitiveTypes.Int8, Nullable: true}, From 76e0f6254b75509d83e44fe8997bd14007907c4f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Aug 2024 15:37:09 -0400 Subject: [PATCH 061/157] GH-43764: [Go][FlightSQL] Add NewPreparedStatement function (#43781) ### Rationale for this change Allowing creation of the prepared statement object outside of the client allows for logging, proxying, and handing off prepared statements if necessary. ### Are these changes tested? Yes * GitHub Issue: #43764 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/flight/flightsql/client.go | 9 +++++++++ go/arrow/flight/flightsql/client_test.go | 21 +++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 4a600e5253e..4c9dc501351 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -1102,6 +1102,15 @@ type PreparedStatement struct { closed bool } +// NewPreparedStatement creates a prepared statement object bound to the provided +// client using the given handle. In general, it should be sufficient to use the +// Prepare function a client and this wouldn't be needed. 
But this can be used +// to propagate a prepared statement from one client to another if needed or if +// proxying requests. +func NewPreparedStatement(client *Client, handle []byte) *PreparedStatement { + return &PreparedStatement{client: client, handle: handle} +} + // Execute executes the prepared statement on the server and returns a FlightInfo // indicating where to retrieve the response. If SetParameters has been called // then the parameter bindings will be sent before execution. diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index 7604b554cbc..d060161f94f 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -378,8 +378,10 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecute() { createRsp := &mockDoActionClient{} defer createRsp.AssertExpectations(s.T()) createRsp.On("Recv").Return(&pb.Result{Body: data}, nil).Once() - createRsp.On("Recv").Return(&pb.Result{}, io.EOF) - createRsp.On("CloseSend").Return(nil) + createRsp.On("Recv").Return(&pb.Result{}, io.EOF).Once() + createRsp.On("Recv").Return(&pb.Result{Body: data}, nil).Once() + createRsp.On("Recv").Return(&pb.Result{}, io.EOF).Once() + createRsp.On("CloseSend").Return(nil).Twice() closeRsp := &mockDoActionClient{} defer closeRsp.AssertExpectations(s.T()) @@ -387,13 +389,13 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecute() { closeRsp.On("CloseSend").Return(nil) s.mockClient.On("DoAction", flightsql.CreatePreparedStatementActionType, action.Body, s.callOpts). - Return(createRsp, nil) + Return(createRsp, nil).Twice() s.mockClient.On("DoAction", flightsql.ClosePreparedStatementActionType, closeAct.Body, s.callOpts). Return(closeRsp, nil) infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)} desc := getDesc(infoCmd) - s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil) + s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil).Twice() prepared, err := s.sqlClient.Prepare(context.TODO(), query, s.callOpts...) s.NoError(err) @@ -404,6 +406,17 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecute() { info, err := prepared.Execute(context.TODO(), s.callOpts...) s.NoError(err) s.Equal(&emptyFlightInfo, info) + + prepared, err = s.sqlClient.Prepare(context.TODO(), query, s.callOpts...) + s.NoError(err) + + secondPrepare := flightsql.NewPreparedStatement(&s.sqlClient, prepared.Handle()) + s.Equal(string(secondPrepare.Handle()), "query") + defer secondPrepare.Close(context.TODO(), s.callOpts...) + + info, err = secondPrepare.Execute(context.TODO(), s.callOpts...) + s.NoError(err) + s.Equal(&emptyFlightInfo, info) } func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { From d47b305bbce037af18ce65dc968074fe1681b4d4 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Thu, 22 Aug 2024 16:04:59 -0400 Subject: [PATCH 062/157] GH-43624: [Go] Add JSON/UUID extension types, extend arrow -> parquet logical type mapping (#43679) ### Rationale for this change - Missing `JSON` extension type implementation. - Current precedent in C++ (and thereby PyArrow) is that canonical extension types do not require manual registration. - Issues like #43640 and #43624 suggest that we need to expose ways of configuring parquet types written from arrow records, but casting the underlying data presents challenges for a generalized approach. 
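To illustrate the last point above: rather than casting storage data, an extension type can declare the Parquet logical type it wants through the new interface added by this PR. The sketch below is a hypothetical user-defined type (`BSONType`, the `myorg.bson` name, and the Binary storage are illustrative, not part of this change); only the `ParquetLogicalType` method is the hook introduced here, and `schema.BSONLogicalType` is assumed to be the existing logical type in `parquet/schema`.

```go
package myext

import (
	"fmt"
	"reflect"

	"github.com/apache/arrow/go/v18/arrow"
	"github.com/apache/arrow/go/v18/arrow/array"
	"github.com/apache/arrow/go/v18/parquet/schema"
)

// BSONType is a hypothetical extension type: logically BSON documents,
// physically stored as a plain Binary column.
type BSONType struct {
	arrow.ExtensionBase
}

// BSONArray exists so ArrayType has something to point at; values are read
// through the underlying Binary storage array.
type BSONArray struct {
	array.ExtensionArrayBase
}

func NewBSONType() *BSONType {
	return &BSONType{ExtensionBase: arrow.ExtensionBase{Storage: arrow.BinaryTypes.Binary}}
}

func (*BSONType) ExtensionName() string   { return "myorg.bson" }
func (*BSONType) Serialize() string       { return "" }
func (*BSONType) ArrayType() reflect.Type { return reflect.TypeOf(BSONArray{}) }

func (t *BSONType) String() string {
	return fmt.Sprintf("extension<%s>", t.ExtensionName())
}

func (t *BSONType) ExtensionEquals(other arrow.ExtensionType) bool {
	return t.ExtensionName() == other.ExtensionName()
}

func (*BSONType) Deserialize(storageType arrow.DataType, _ string) (arrow.ExtensionType, error) {
	if !arrow.TypeEqual(storageType, arrow.BinaryTypes.Binary) {
		return nil, fmt.Errorf("invalid storage type for BSONType: %s", storageType)
	}
	return NewBSONType(), nil
}

// ParquetLogicalType is the hook added by this patch: when pqarrow maps a
// field of this type to a Parquet schema node, the Binary storage column is
// annotated as BSON instead of falling back to the storage default.
func (*BSONType) ParquetLogicalType() schema.LogicalType {
	return schema.BSONLogicalType{}
}
```

Writing a column of this type through `pqarrow` would then produce a Binary column annotated as BSON, with no cast of the underlying data.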
### What changes are included in this PR? - Move `UUIDType` from `internal` to `arrow/extensions` - Implement `JSON` canonical extension type - Automatically register all canonical extension types at initialization - remove register/unregister from various locations these extension types are used - Add new `CustomParquetType` interface so extension types can specify their target `LogicalType` in Parquet - Refactor parquet `fieldToNode` to split up `PrimitiveNode` type mapping for leaves from `GroupNode` composition - Simplify parquet `LogicalType` to use only value receivers ### Are these changes tested? Yes ### Are there any user-facing changes? - `UUID` and `JSON` extension types are available to end users. - Canonical extension types will automatically be recognized in IPC without registration. - Users with their own extension type implementations may use the `CustomParquetType` interface to control Parquet conversion without needing to fork or upstream the change. * GitHub Issue: #43624 Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- docs/source/status.rst | 6 + go/arrow/array/array_test.go | 4 +- go/arrow/array/diff_test.go | 4 +- go/arrow/array/extension_test.go | 10 - go/arrow/avro/reader_types.go | 4 +- go/arrow/avro/schema.go | 4 +- go/arrow/compute/exec/span_test.go | 6 +- go/arrow/csv/reader_test.go | 4 +- go/arrow/csv/writer_test.go | 6 +- go/arrow/datatype_extension_test.go | 18 +- go/arrow/extensions/bool8_test.go | 3 - go/arrow/extensions/extensions.go | 36 +++ go/arrow/extensions/json.go | 148 ++++++++++ go/arrow/extensions/json_test.go | 268 ++++++++++++++++++ go/arrow/extensions/opaque_test.go | 3 - go/arrow/extensions/uuid.go | 265 +++++++++++++++++ go/arrow/extensions/uuid_test.go | 257 +++++++++++++++++ .../internal/flight_integration/scenario.go | 4 - .../cmd/arrow-json-integration-test/main.go | 4 - go/arrow/ipc/metadata_test.go | 11 +- go/internal/types/extension_types.go | 227 +-------------- go/internal/types/extension_types_test.go | 95 ------- go/parquet/cmd/parquet_reader/main.go | 2 +- go/parquet/metadata/app_version.go | 2 +- go/parquet/pqarrow/encode_arrow_test.go | 82 ++++-- go/parquet/pqarrow/path_builder_test.go | 6 +- go/parquet/pqarrow/schema.go | 228 +++++++-------- go/parquet/pqarrow/schema_test.go | 15 +- go/parquet/schema/converted_types.go | 8 +- go/parquet/schema/logical_types.go | 30 +- go/parquet/schema/logical_types_test.go | 40 +-- go/parquet/schema/schema_element_test.go | 4 +- 32 files changed, 1221 insertions(+), 583 deletions(-) create mode 100644 go/arrow/extensions/extensions.go create mode 100644 go/arrow/extensions/json.go create mode 100644 go/arrow/extensions/json_test.go create mode 100644 go/arrow/extensions/uuid.go create mode 100644 go/arrow/extensions/uuid_test.go delete mode 100644 go/internal/types/extension_types_test.go diff --git a/docs/source/status.rst b/docs/source/status.rst index c232aa280be..5e2c2cc19c8 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -119,6 +119,12 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Variable shape tensor | | | | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| JSON | | | ✓ | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| UUID | | | ✓ | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| 8-bit Boolean | ✓ | | ✓ | | | 
| | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ Notes: diff --git a/go/arrow/array/array_test.go b/go/arrow/array/array_test.go index 4d83766b4fa..4f0627c6000 100644 --- a/go/arrow/array/array_test.go +++ b/go/arrow/array/array_test.go @@ -21,9 +21,9 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/internal/testing/tools" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/stretchr/testify/assert" ) @@ -122,7 +122,7 @@ func TestMakeFromData(t *testing.T) { {name: "dictionary", d: &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Uint64, ValueType: &testDataType{arrow.TIMESTAMP}}, dict: array.NewData(&testDataType{arrow.TIMESTAMP}, 0 /* length */, make([]*memory.Buffer, 2 /*null bitmap, values*/), nil /* childData */, 0 /* nulls */, 0 /* offset */)}, {name: "extension", d: &testDataType{arrow.EXTENSION}, expPanic: true, expError: "arrow/array: DataType for ExtensionArray must implement arrow.ExtensionType"}, - {name: "extension", d: types.NewUUIDType()}, + {name: "extension", d: extensions.NewUUIDType()}, {name: "run end encoded", d: arrow.RunEndEncodedOf(arrow.PrimitiveTypes.Int64, arrow.PrimitiveTypes.Int64), child: []arrow.ArrayData{ array.NewData(&testDataType{arrow.INT64}, 0 /* length */, make([]*memory.Buffer, 2 /*null bitmap, values*/), nil /* childData */, 0 /* nulls */, 0 /* offset */), diff --git a/go/arrow/array/diff_test.go b/go/arrow/array/diff_test.go index 65d212be118..9c9ce6a53ae 100644 --- a/go/arrow/array/diff_test.go +++ b/go/arrow/array/diff_test.go @@ -25,9 +25,9 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/internal/json" - "github.com/apache/arrow/go/v18/internal/types" ) type diffTestCase struct { @@ -861,7 +861,7 @@ func TestEdits_UnifiedDiff(t *testing.T) { }, { name: "extensions", - dataType: types.NewUUIDType(), + dataType: extensions.NewUUIDType(), baseJSON: `["00000000-0000-0000-0000-000000000000", "00000000-0000-0000-0000-000000000001"]`, targetJSON: `["00000000-0000-0000-0000-000000000001", "00000000-0000-0000-0000-000000000002"]`, want: `@@ -0, +0 @@ diff --git a/go/arrow/array/extension_test.go b/go/arrow/array/extension_test.go index 71ea9f105af..26245cf015d 100644 --- a/go/arrow/array/extension_test.go +++ b/go/arrow/array/extension_test.go @@ -30,16 +30,6 @@ type ExtensionTypeTestSuite struct { suite.Suite } -func (e *ExtensionTypeTestSuite) SetupTest() { - e.NoError(arrow.RegisterExtensionType(types.NewUUIDType())) -} - -func (e *ExtensionTypeTestSuite) TearDownTest() { - if arrow.GetExtensionType("uuid") != nil { - e.NoError(arrow.UnregisterExtensionType("uuid")) - } -} - func (e *ExtensionTypeTestSuite) TestParametricEquals() { p1Type := types.NewParametric1Type(6) p2Type := types.NewParametric1Type(6) diff --git a/go/arrow/avro/reader_types.go b/go/arrow/avro/reader_types.go index e07cd380d51..dab2b33dce6 100644 --- a/go/arrow/avro/reader_types.go +++ b/go/arrow/avro/reader_types.go @@ -27,8 +27,8 @@ import ( "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" 
"github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" ) type dataLoader struct { @@ -436,7 +436,7 @@ func mapFieldBuilders(b array.Builder, field arrow.Field, parent *fieldPos) { } return nil } - case *types.UUIDBuilder: + case *extensions.UUIDBuilder: f.appendFunc = func(data interface{}) error { switch dt := data.(type) { case nil: diff --git a/go/arrow/avro/schema.go b/go/arrow/avro/schema.go index 007dad06c19..a6de3718d3c 100644 --- a/go/arrow/avro/schema.go +++ b/go/arrow/avro/schema.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/decimal128" - "github.com/apache/arrow/go/v18/internal/types" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/internal/utils" avro "github.com/hamba/avro/v2" ) @@ -349,7 +349,7 @@ func avroLogicalToArrowField(n *schemaNode) { // The uuid logical type represents a random generated universally unique identifier (UUID). // A uuid logical type annotates an Avro string. The string has to conform with RFC-4122 case "uuid": - dt = types.NewUUIDType() + dt = extensions.NewUUIDType() // The date logical type represents a date within the calendar, with no reference to a particular // time zone or time of day. diff --git a/go/arrow/compute/exec/span_test.go b/go/arrow/compute/exec/span_test.go index f5beb45ee14..018fbb7d623 100644 --- a/go/arrow/compute/exec/span_test.go +++ b/go/arrow/compute/exec/span_test.go @@ -29,6 +29,7 @@ import ( "github.com/apache/arrow/go/v18/arrow/compute/exec" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/endian" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/arrow/scalar" "github.com/apache/arrow/go/v18/internal/types" @@ -192,9 +193,6 @@ func TestArraySpan_NumBuffers(t *testing.T) { Children []exec.ArraySpan } - arrow.RegisterExtensionType(types.NewUUIDType()) - defer arrow.UnregisterExtensionType("uuid") - tests := []struct { name string fields fields @@ -207,7 +205,7 @@ func TestArraySpan_NumBuffers(t *testing.T) { {"large binary", fields{Type: arrow.BinaryTypes.LargeBinary}, 3}, {"string", fields{Type: arrow.BinaryTypes.String}, 3}, {"large string", fields{Type: arrow.BinaryTypes.LargeString}, 3}, - {"extension", fields{Type: types.NewUUIDType()}, 2}, + {"extension", fields{Type: extensions.NewUUIDType()}, 2}, {"int32", fields{Type: arrow.PrimitiveTypes.Int32}, 2}, } for _, tt := range tests { diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go index b0775b9b11a..6a89d497042 100644 --- a/go/arrow/csv/reader_test.go +++ b/go/arrow/csv/reader_test.go @@ -30,8 +30,8 @@ import ( "github.com/apache/arrow/go/v18/arrow/csv" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -356,7 +356,7 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool, stringsCanBeN {Name: "binary", Type: arrow.BinaryTypes.Binary}, {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, - {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "uuid", Type: extensions.NewUUIDType()}, {Name: "date32", Type: 
arrow.PrimitiveTypes.Date32}, {Name: "date64", Type: arrow.PrimitiveTypes.Date64}, }, diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index be9ab961c3e..2ae01a6d490 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -31,9 +31,9 @@ import ( "github.com/apache/arrow/go/v18/arrow/csv" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/float16" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/google/uuid" ) @@ -230,7 +230,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo {Name: "binary", Type: arrow.BinaryTypes.Binary}, {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, - {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "uuid", Type: extensions.NewUUIDType()}, {Name: "null", Type: arrow.Null}, }, nil, @@ -285,7 +285,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo b.Field(22).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) b.Field(23).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) b.Field(24).(*array.FixedSizeBinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) - b.Field(25).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil) + b.Field(25).(*extensions.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil) b.Field(26).(*array.NullBuilder).AppendEmptyValues(3) for _, field := range b.Fields() { diff --git a/go/arrow/datatype_extension_test.go b/go/arrow/datatype_extension_test.go index c3e595f523e..7244d377bd2 100644 --- a/go/arrow/datatype_extension_test.go +++ b/go/arrow/datatype_extension_test.go @@ -21,7 +21,7 @@ import ( "testing" "github.com/apache/arrow/go/v18/arrow" - "github.com/apache/arrow/go/v18/internal/types" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" ) @@ -50,24 +50,14 @@ type ExtensionTypeTestSuite struct { suite.Suite } -func (e *ExtensionTypeTestSuite) SetupTest() { - e.NoError(arrow.RegisterExtensionType(types.NewUUIDType())) -} - -func (e *ExtensionTypeTestSuite) TearDownTest() { - if arrow.GetExtensionType("uuid") != nil { - e.NoError(arrow.UnregisterExtensionType("uuid")) - } -} - func (e *ExtensionTypeTestSuite) TestExtensionType() { e.Nil(arrow.GetExtensionType("uuid-unknown")) - e.NotNil(arrow.GetExtensionType("uuid")) + e.NotNil(arrow.GetExtensionType("arrow.uuid")) - e.Error(arrow.RegisterExtensionType(types.NewUUIDType())) + e.Error(arrow.RegisterExtensionType(extensions.NewUUIDType())) e.Error(arrow.UnregisterExtensionType("uuid-unknown")) - typ := types.NewUUIDType() + typ := extensions.NewUUIDType() e.Implements((*arrow.ExtensionType)(nil), typ) e.Equal(arrow.EXTENSION, typ.ID()) e.Equal("extension", typ.Name()) diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index 9f7365d1555..ff129e24bc8 100644 --- a/go/arrow/extensions/bool8_test.go +++ 
b/go/arrow/extensions/bool8_test.go @@ -178,9 +178,6 @@ func TestReinterpretStorageEqualToValues(t *testing.T) { func TestBool8TypeBatchIPCRoundTrip(t *testing.T) { typ := extensions.NewBool8Type() - arrow.RegisterExtensionType(typ) - defer arrow.UnregisterExtensionType(typ.ExtensionName()) - storage, _, err := array.FromJSON(memory.DefaultAllocator, arrow.PrimitiveTypes.Int8, strings.NewReader(`[-1, 0, 1, 2, null]`)) require.NoError(t, err) diff --git a/go/arrow/extensions/extensions.go b/go/arrow/extensions/extensions.go new file mode 100644 index 00000000000..03c6923e95f --- /dev/null +++ b/go/arrow/extensions/extensions.go @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + "github.com/apache/arrow/go/v18/arrow" +) + +var canonicalExtensionTypes = []arrow.ExtensionType{ + &Bool8Type{}, + &UUIDType{}, + &OpaqueType{}, + &JSONType{}, +} + +func init() { + for _, extType := range canonicalExtensionTypes { + if err := arrow.RegisterExtensionType(extType); err != nil { + panic(err) + } + } +} diff --git a/go/arrow/extensions/json.go b/go/arrow/extensions/json.go new file mode 100644 index 00000000000..12c49f9c0a7 --- /dev/null +++ b/go/arrow/extensions/json.go @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + "fmt" + "reflect" + "slices" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/internal/json" + "github.com/apache/arrow/go/v18/parquet/schema" +) + +var jsonSupportedStorageTypes = []arrow.DataType{ + arrow.BinaryTypes.String, + arrow.BinaryTypes.LargeString, + arrow.BinaryTypes.StringView, +} + +// JSONType represents a UTF-8 encoded JSON string as specified in RFC8259. +type JSONType struct { + arrow.ExtensionBase +} + +// ParquetLogicalType implements pqarrow.ExtensionCustomParquetType. 
+func (b *JSONType) ParquetLogicalType() schema.LogicalType { + return schema.JSONLogicalType{} +} + +// NewJSONType creates a new JSONType with the specified storage type. +// storageType must be one of String, LargeString, StringView. +func NewJSONType(storageType arrow.DataType) (*JSONType, error) { + if !slices.Contains(jsonSupportedStorageTypes, storageType) { + return nil, fmt.Errorf("unsupported storage type for JSON extension type: %s", storageType) + } + return &JSONType{ExtensionBase: arrow.ExtensionBase{Storage: storageType}}, nil +} + +func (b *JSONType) ArrayType() reflect.Type { return reflect.TypeOf(JSONArray{}) } + +func (b *JSONType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if !(data == "" || data == "{}") { + return nil, fmt.Errorf("serialized metadata for JSON extension type must be '' or '{}', found: %s", data) + } + return NewJSONType(storageType) +} + +func (b *JSONType) ExtensionEquals(other arrow.ExtensionType) bool { + return b.ExtensionName() == other.ExtensionName() && arrow.TypeEqual(b.Storage, other.StorageType()) +} + +func (b *JSONType) ExtensionName() string { return "arrow.json" } + +func (b *JSONType) Serialize() string { return "" } + +func (b *JSONType) String() string { + return fmt.Sprintf("extension<%s[storage_type=%s]>", b.ExtensionName(), b.Storage) +} + +// JSONArray is logically an array of UTF-8 encoded JSON strings. +// Its values are unmarshaled to native Go values. +type JSONArray struct { + array.ExtensionArrayBase +} + +func (a *JSONArray) String() string { + b, err := a.MarshalJSON() + if err != nil { + panic(fmt.Sprintf("failed marshal JSONArray: %s", err)) + } + + return string(b) +} + +func (a *JSONArray) Value(i int) any { + val := a.ValueBytes(i) + + var res any + if err := json.Unmarshal(val, &res); err != nil { + panic(err) + } + + return res +} + +func (a *JSONArray) ValueStr(i int) string { + return string(a.ValueBytes(i)) +} + +func (a *JSONArray) ValueBytes(i int) []byte { + // convert to json.RawMessage, set to nil if elem isNull. + val := a.ValueJSON(i) + + // simply returns wrapped bytes, or null if val is nil. + b, err := val.MarshalJSON() + if err != nil { + panic(err) + } + + return b +} + +// ValueJSON wraps the underlying string value as a json.RawMessage, +// or returns nil if the array value is null. +func (a *JSONArray) ValueJSON(i int) json.RawMessage { + var val json.RawMessage + if a.IsValid(i) { + val = json.RawMessage(a.Storage().(array.StringLike).Value(i)) + } + return val +} + +// MarshalJSON implements json.Marshaler. +// Marshaling json.RawMessage is a no-op, except that nil values will +// be marshaled as a JSON null. +func (a *JSONArray) MarshalJSON() ([]byte, error) { + values := make([]json.RawMessage, a.Len()) + for i := 0; i < a.Len(); i++ { + values[i] = a.ValueJSON(i) + } + return json.Marshal(values) +} + +// GetOneForMarshal implements arrow.Array. +func (a *JSONArray) GetOneForMarshal(i int) interface{} { + return a.ValueJSON(i) +} + +var ( + _ arrow.ExtensionType = (*JSONType)(nil) + _ array.ExtensionArray = (*JSONArray)(nil) +) diff --git a/go/arrow/extensions/json_test.go b/go/arrow/extensions/json_test.go new file mode 100644 index 00000000000..21acc58f939 --- /dev/null +++ b/go/arrow/extensions/json_test.go @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions_test + +import ( + "bytes" + "strings" + "testing" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/ipc" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestJSONTypeBasics(t *testing.T) { + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + require.NoError(t, err) + + typLarge, err := extensions.NewJSONType(arrow.BinaryTypes.LargeString) + require.NoError(t, err) + + typView, err := extensions.NewJSONType(arrow.BinaryTypes.StringView) + require.NoError(t, err) + + assert.Equal(t, "arrow.json", typ.ExtensionName()) + assert.Equal(t, "arrow.json", typLarge.ExtensionName()) + assert.Equal(t, "arrow.json", typView.ExtensionName()) + + assert.True(t, typ.ExtensionEquals(typ)) + assert.True(t, typLarge.ExtensionEquals(typLarge)) + assert.True(t, typView.ExtensionEquals(typView)) + + assert.False(t, arrow.TypeEqual(arrow.BinaryTypes.String, typ)) + assert.False(t, arrow.TypeEqual(typ, typLarge)) + assert.False(t, arrow.TypeEqual(typ, typView)) + assert.False(t, arrow.TypeEqual(typLarge, typView)) + + assert.True(t, arrow.TypeEqual(arrow.BinaryTypes.String, typ.StorageType())) + assert.True(t, arrow.TypeEqual(arrow.BinaryTypes.LargeString, typLarge.StorageType())) + assert.True(t, arrow.TypeEqual(arrow.BinaryTypes.StringView, typView.StorageType())) + + assert.Equal(t, "extension", typ.String()) + assert.Equal(t, "extension", typLarge.String()) + assert.Equal(t, "extension", typView.String()) +} + +var jsonTestCases = []struct { + Name string + StorageType arrow.DataType + StorageBuilder func(mem memory.Allocator) array.Builder +}{ + { + Name: "string", + StorageType: arrow.BinaryTypes.String, + StorageBuilder: func(mem memory.Allocator) array.Builder { return array.NewStringBuilder(mem) }, + }, + { + Name: "large_string", + StorageType: arrow.BinaryTypes.LargeString, + StorageBuilder: func(mem memory.Allocator) array.Builder { return array.NewLargeStringBuilder(mem) }, + }, + { + Name: "string_view", + StorageType: arrow.BinaryTypes.StringView, + StorageBuilder: func(mem memory.Allocator) array.Builder { return array.NewStringViewBuilder(mem) }, + }, +} + +func TestJSONTypeCreateFromArray(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := 
array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 6, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + jsonArr, ok := arr.(*extensions.JSONArray) + require.True(t, ok) + + require.Equal(t, "foobar", jsonArr.Value(0)) + require.Equal(t, nil, jsonArr.Value(1)) + require.Equal(t, map[string]any{"foo": "bar"}, jsonArr.Value(2)) + require.Equal(t, float64(42), jsonArr.Value(3)) + require.Equal(t, true, jsonArr.Value(4)) + require.Equal(t, []any{float64(1), true, "3", nil, map[string]any{"five": float64(5)}}, jsonArr.Value(5)) + }) + } +} + +func TestJSONTypeBatchIPCRoundTrip(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + batch := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "field", Type: typ, Nullable: true}}, nil), + []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.Record + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s, got: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s, got: %s", + batch, written) + }) + } +} + +func TestMarshallJSONArray(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 6, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + jsonArr, ok := arr.(*extensions.JSONArray) + require.True(t, ok) + + b, err := jsonArr.MarshalJSON() + require.NoError(t, err) + + expectedJSON := `["foobar",null,{"foo":"bar"},42,true,[1,true,"3",null,{"five":5}]]` + require.Equal(t, expectedJSON, string(b)) + require.Equal(t, expectedJSON, jsonArr.String()) + }) + } +} + +func TestJSONRecordToJSON(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + 
bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 6, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + jsonArr, ok := arr.(*extensions.JSONArray) + require.True(t, ok) + + rec := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "json", Type: typ, Nullable: true}}, nil), []arrow.Array{jsonArr}, 6) + defer rec.Release() + + buf := bytes.NewBuffer([]byte("\n")) // expected output has leading newline for clearer formatting + require.NoError(t, array.RecordToJSON(rec, buf)) + + expectedJSON := ` + {"json":"foobar"} + {"json":null} + {"json":{"foo":"bar"}} + {"json":42} + {"json":true} + {"json":[1,true,"3",null,{"five":5}]} + ` + + expectedJSONLines := strings.Split(expectedJSON, "\n") + actualJSONLines := strings.Split(buf.String(), "\n") + + require.Equal(t, len(expectedJSONLines), len(actualJSONLines)) + for i := range expectedJSONLines { + if strings.TrimSpace(expectedJSONLines[i]) != "" { + require.JSONEq(t, expectedJSONLines[i], actualJSONLines[i]) + } + } + }) + } +} diff --git a/go/arrow/extensions/opaque_test.go b/go/arrow/extensions/opaque_test.go index b6686e97bc0..a0fc8962ce5 100644 --- a/go/arrow/extensions/opaque_test.go +++ b/go/arrow/extensions/opaque_test.go @@ -161,9 +161,6 @@ func TestOpaqueTypeMetadataRoundTrip(t *testing.T) { func TestOpaqueTypeBatchRoundTrip(t *testing.T) { typ := extensions.NewOpaqueType(arrow.BinaryTypes.String, "geometry", "adbc.postgresql") - arrow.RegisterExtensionType(typ) - defer arrow.UnregisterExtensionType(typ.ExtensionName()) - storage, _, err := array.FromJSON(memory.DefaultAllocator, arrow.BinaryTypes.String, strings.NewReader(`["foobar", null]`)) require.NoError(t, err) diff --git a/go/arrow/extensions/uuid.go b/go/arrow/extensions/uuid.go new file mode 100644 index 00000000000..422b9ea1188 --- /dev/null +++ b/go/arrow/extensions/uuid.go @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + "bytes" + "fmt" + "reflect" + "strings" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" + "github.com/apache/arrow/go/v18/parquet/schema" + "github.com/google/uuid" +) + +type UUIDBuilder struct { + *array.ExtensionBuilder +} + +// NewUUIDBuilder creates a new UUIDBuilder, exposing a convenient and efficient interface +// for writing uuid.UUID (or [16]byte) values to the underlying FixedSizeBinary storage array. 
+func NewUUIDBuilder(mem memory.Allocator) *UUIDBuilder { + return &UUIDBuilder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewUUIDType())} +} + +func (b *UUIDBuilder) Append(v uuid.UUID) { + b.AppendBytes(v) +} + +func (b *UUIDBuilder) AppendBytes(v [16]byte) { + b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).Append(v[:]) +} + +func (b *UUIDBuilder) UnsafeAppend(v uuid.UUID) { + b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).UnsafeAppend(v[:]) +} + +func (b *UUIDBuilder) AppendValueFromString(s string) error { + if s == array.NullValueStr { + b.AppendNull() + return nil + } + + uid, err := uuid.Parse(s) + if err != nil { + return err + } + + b.Append(uid) + return nil +} + +func (b *UUIDBuilder) AppendValues(v []uuid.UUID, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + data := make([][]byte, len(v)) + for i := range v { + if len(valid) > 0 && !valid[i] { + continue + } + data[i] = v[i][:] + } + b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, valid) +} + +func (b *UUIDBuilder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + var val uuid.UUID + switch v := t.(type) { + case string: + val, err = uuid.Parse(v) + if err != nil { + return err + } + case []byte: + val, err = uuid.ParseBytes(v) + if err != nil { + return err + } + case nil: + b.AppendNull() + return nil + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), + } + } + + b.Append(val) + return nil +} + +func (b *UUIDBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +func (b *UUIDBuilder) UnmarshalJSON(data []byte) error { + dec := json.NewDecoder(bytes.NewReader(data)) + t, err := dec.Token() + if err != nil { + return err + } + + if delim, ok := t.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("uuid builder must unpack from json array, found %s", delim) + } + + return b.Unmarshal(dec) +} + +// UUIDArray is a simple array which is a FixedSizeBinary(16) +type UUIDArray struct { + array.ExtensionArrayBase +} + +func (a *UUIDArray) String() string { + arr := a.Storage().(*array.FixedSizeBinary) + o := new(strings.Builder) + o.WriteString("[") + for i := 0; i < arr.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(array.NullValueStr) + default: + fmt.Fprintf(o, "%q", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func (a *UUIDArray) Value(i int) uuid.UUID { + if a.IsNull(i) { + return uuid.Nil + } + return uuid.Must(uuid.FromBytes(a.Storage().(*array.FixedSizeBinary).Value(i))) +} + +func (a *UUIDArray) Values() []uuid.UUID { + values := make([]uuid.UUID, a.Len()) + for i := range values { + values[i] = a.Value(i) + } + return values +} + +func (a *UUIDArray) ValueStr(i int) string { + switch { + case a.IsNull(i): + return array.NullValueStr + default: + return a.Value(i).String() + } +} + +func (a *UUIDArray) MarshalJSON() ([]byte, error) { + vals := make([]any, a.Len()) + for i := range vals { + vals[i] = a.GetOneForMarshal(i) + } + return json.Marshal(vals) +} + +func (a *UUIDArray) GetOneForMarshal(i int) interface{} { + if a.IsValid(i) { + return a.Value(i) + } + return nil +} + +// UUIDType is a simple extension type that represents a 
FixedSizeBinary(16) +// to be used for representing UUIDs +type UUIDType struct { + arrow.ExtensionBase +} + +// ParquetLogicalType implements pqarrow.ExtensionCustomParquetType. +func (e *UUIDType) ParquetLogicalType() schema.LogicalType { + return schema.UUIDLogicalType{} +} + +// NewUUIDType is a convenience function to create an instance of UUIDType +// with the correct storage type +func NewUUIDType() *UUIDType { + return &UUIDType{ExtensionBase: arrow.ExtensionBase{Storage: &arrow.FixedSizeBinaryType{ByteWidth: 16}}} +} + +// ArrayType returns TypeOf(UUIDArray{}) for constructing UUID arrays +func (*UUIDType) ArrayType() reflect.Type { + return reflect.TypeOf(UUIDArray{}) +} + +func (*UUIDType) ExtensionName() string { + return "arrow.uuid" +} + +func (e *UUIDType) String() string { + return fmt.Sprintf("extension<%s>", e.ExtensionName()) +} + +func (e *UUIDType) MarshalJSON() ([]byte, error) { + return []byte(fmt.Sprintf(`{"name":"%s","metadata":%s}`, e.ExtensionName(), e.Serialize())), nil +} + +func (*UUIDType) Serialize() string { + return "" +} + +// Deserialize expects storageType to be FixedSizeBinaryType{ByteWidth: 16} +func (*UUIDType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if !arrow.TypeEqual(storageType, &arrow.FixedSizeBinaryType{ByteWidth: 16}) { + return nil, fmt.Errorf("invalid storage type for UUIDType: %s", storageType.Name()) + } + return NewUUIDType(), nil +} + +// ExtensionEquals returns true if both extensions have the same name +func (e *UUIDType) ExtensionEquals(other arrow.ExtensionType) bool { + return e.ExtensionName() == other.ExtensionName() +} + +func (*UUIDType) NewBuilder(mem memory.Allocator) array.Builder { + return NewUUIDBuilder(mem) +} + +var ( + _ arrow.ExtensionType = (*UUIDType)(nil) + _ array.CustomExtensionBuilder = (*UUIDType)(nil) + _ array.ExtensionArray = (*UUIDArray)(nil) + _ array.Builder = (*UUIDBuilder)(nil) +) diff --git a/go/arrow/extensions/uuid_test.go b/go/arrow/extensions/uuid_test.go new file mode 100644 index 00000000000..80c621db2a0 --- /dev/null +++ b/go/arrow/extensions/uuid_test.go @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package extensions_test + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/ipc" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var testUUID = uuid.New() + +func TestUUIDExtensionBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + builder := extensions.NewUUIDBuilder(mem) + builder.Append(testUUID) + builder.AppendNull() + builder.AppendBytes(testUUID) + arr := builder.NewArray() + defer arr.Release() + arrStr := arr.String() + assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[1]s"]`, testUUID), arrStr) + jsonStr, err := json.Marshal(arr) + assert.NoError(t, err) + + arr1, _, err := array.FromJSON(mem, extensions.NewUUIDType(), bytes.NewReader(jsonStr)) + defer arr1.Release() + assert.NoError(t, err) + assert.True(t, array.Equal(arr1, arr)) + + require.NoError(t, json.Unmarshal(jsonStr, builder)) + arr2 := builder.NewArray() + defer arr2.Release() + assert.True(t, array.Equal(arr2, arr)) +} + +func TestUUIDExtensionRecordBuilder(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "uuid", Type: extensions.NewUUIDType()}, + }, nil) + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + builder.Field(0).(*extensions.UUIDBuilder).Append(testUUID) + builder.Field(0).(*extensions.UUIDBuilder).AppendNull() + builder.Field(0).(*extensions.UUIDBuilder).Append(testUUID) + record := builder.NewRecord() + b, err := record.MarshalJSON() + require.NoError(t, err) + require.Equal(t, "[{\"uuid\":\""+testUUID.String()+"\"}\n,{\"uuid\":null}\n,{\"uuid\":\""+testUUID.String()+"\"}\n]", string(b)) + record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) + require.NoError(t, err) + require.Equal(t, record, record1) +} + +func TestUUIDStringRoundTrip(t *testing.T) { + // 1. create array + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + b := extensions.NewUUIDBuilder(mem) + b.Append(uuid.Nil) + b.AppendNull() + b.Append(uuid.NameSpaceURL) + b.AppendNull() + b.Append(testUUID) + + arr := b.NewArray() + defer arr.Release() + + // 2. 
create array via AppendValueFromString + b1 := extensions.NewUUIDBuilder(mem) + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray() + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) +} + +func TestUUIDTypeBasics(t *testing.T) { + typ := extensions.NewUUIDType() + + assert.Equal(t, "arrow.uuid", typ.ExtensionName()) + assert.True(t, typ.ExtensionEquals(typ)) + + assert.True(t, arrow.TypeEqual(typ, typ)) + assert.False(t, arrow.TypeEqual(&arrow.FixedSizeBinaryType{ByteWidth: 16}, typ)) + assert.True(t, arrow.TypeEqual(&arrow.FixedSizeBinaryType{ByteWidth: 16}, typ.StorageType())) + + assert.Equal(t, "extension", typ.String()) +} + +func TestUUIDTypeCreateFromArray(t *testing.T) { + typ := extensions.NewUUIDType() + + bldr := array.NewFixedSizeBinaryBuilder(memory.DefaultAllocator, &arrow.FixedSizeBinaryType{ByteWidth: 16}) + defer bldr.Release() + + bldr.Append(testUUID[:]) + bldr.AppendNull() + bldr.Append(testUUID[:]) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 3, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + uuidArr, ok := arr.(*extensions.UUIDArray) + require.True(t, ok) + + require.Equal(t, testUUID, uuidArr.Value(0)) + require.Equal(t, uuid.Nil, uuidArr.Value(1)) + require.Equal(t, testUUID, uuidArr.Value(2)) +} + +func TestUUIDTypeBatchIPCRoundTrip(t *testing.T) { + typ := extensions.NewUUIDType() + + bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.Append(testUUID) + bldr.AppendNull() + bldr.AppendBytes(testUUID) + + arr := bldr.NewArray() + defer arr.Release() + + batch := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "field", Type: typ, Nullable: true}}, nil), + []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.Record + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s, got: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s, got: %s", + batch, written) +} + +func TestMarshallUUIDArray(t *testing.T) { + bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.Append(testUUID) + bldr.AppendNull() + bldr.AppendBytes(testUUID) + + arr := bldr.NewArray() + defer arr.Release() + + assert.Equal(t, 3, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + uuidArr, ok := arr.(*extensions.UUIDArray) + require.True(t, ok) + + b, err := uuidArr.MarshalJSON() + require.NoError(t, err) + + expectedJSON := fmt.Sprintf(`["%[1]s",null,"%[1]s"]`, testUUID) + require.Equal(t, expectedJSON, string(b)) +} + +func TestUUIDRecordToJSON(t *testing.T) { + typ := extensions.NewUUIDType() + + bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator) + defer bldr.Release() + + uuid1 := uuid.MustParse("8c607ed4-07b2-4b9c-b5eb-c0387357f9ae") + + bldr.Append(uuid1) + bldr.AppendNull() + + // c5f2cbd9-7094-491a-b267-167bb62efe02 + bldr.AppendBytes([16]byte{197, 242, 203, 217, 112, 148, 73, 26, 178, 103, 22, 123, 182, 46, 254, 2}) + + arr := bldr.NewArray() + defer arr.Release() 
+ + assert.Equal(t, 3, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + uuidArr, ok := arr.(*extensions.UUIDArray) + require.True(t, ok) + + rec := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "uuid", Type: typ, Nullable: true}}, nil), []arrow.Array{uuidArr}, 3) + defer rec.Release() + + buf := bytes.NewBuffer([]byte("\n")) // expected output has leading newline for clearer formatting + require.NoError(t, array.RecordToJSON(rec, buf)) + + expectedJSON := ` + {"uuid":"8c607ed4-07b2-4b9c-b5eb-c0387357f9ae"} + {"uuid":null} + {"uuid":"c5f2cbd9-7094-491a-b267-167bb62efe02"} + ` + + expectedJSONLines := strings.Split(expectedJSON, "\n") + actualJSONLines := strings.Split(buf.String(), "\n") + + require.Equal(t, len(expectedJSONLines), len(actualJSONLines)) + for i := range expectedJSONLines { + if strings.TrimSpace(expectedJSONLines[i]) != "" { + require.JSONEq(t, expectedJSONLines[i], actualJSONLines[i]) + } + } +} diff --git a/go/arrow/internal/flight_integration/scenario.go b/go/arrow/internal/flight_integration/scenario.go index 1528bb05d9d..b9535002a0a 100644 --- a/go/arrow/internal/flight_integration/scenario.go +++ b/go/arrow/internal/flight_integration/scenario.go @@ -40,7 +40,6 @@ import ( "github.com/apache/arrow/go/v18/arrow/internal/arrjson" "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "golang.org/x/xerrors" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -161,9 +160,6 @@ func (s *defaultIntegrationTester) RunClient(addr string, opts ...grpc.DialOptio ctx := context.Background() - arrow.RegisterExtensionType(types.NewUUIDType()) - defer arrow.UnregisterExtensionType("uuid") - descr := &flight.FlightDescriptor{ Type: flight.DescriptorPATH, Path: []string{s.path}, diff --git a/go/arrow/ipc/cmd/arrow-json-integration-test/main.go b/go/arrow/ipc/cmd/arrow-json-integration-test/main.go index b3e1dcac141..c47a091268b 100644 --- a/go/arrow/ipc/cmd/arrow-json-integration-test/main.go +++ b/go/arrow/ipc/cmd/arrow-json-integration-test/main.go @@ -22,12 +22,10 @@ import ( "log" "os" - "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/arrio" "github.com/apache/arrow/go/v18/arrow/internal/arrjson" "github.com/apache/arrow/go/v18/arrow/ipc" - "github.com/apache/arrow/go/v18/internal/types" ) func main() { @@ -50,8 +48,6 @@ func main() { } func runCommand(jsonName, arrowName, mode string, verbose bool) error { - arrow.RegisterExtensionType(types.NewUUIDType()) - if jsonName == "" { return fmt.Errorf("must specify json file name") } diff --git a/go/arrow/ipc/metadata_test.go b/go/arrow/ipc/metadata_test.go index 33bc63c2a00..14b8da2cf7c 100644 --- a/go/arrow/ipc/metadata_test.go +++ b/go/arrow/ipc/metadata_test.go @@ -23,10 +23,10 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/internal/dictutils" "github.com/apache/arrow/go/v18/arrow/internal/flatbuf" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" flatbuffers "github.com/google/flatbuffers/go" "github.com/stretchr/testify/assert" ) @@ -169,7 +169,7 @@ func TestRWFooter(t *testing.T) { } func exampleUUID(mem memory.Allocator) arrow.Array { - extType := types.NewUUIDType() + extType := extensions.NewUUIDType() bldr := array.NewExtensionBuilder(mem, extType) defer bldr.Release() @@ 
-184,9 +184,6 @@ func TestUnrecognizedExtensionType(t *testing.T) { pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer pool.AssertSize(t, 0) - // register the uuid type - assert.NoError(t, arrow.RegisterExtensionType(types.NewUUIDType())) - extArr := exampleUUID(pool) defer extArr.Release() @@ -205,7 +202,9 @@ func TestUnrecognizedExtensionType(t *testing.T) { // unregister the uuid type before we read back the buffer so it is // unrecognized when reading back the record batch. - assert.NoError(t, arrow.UnregisterExtensionType("uuid")) + assert.NoError(t, arrow.UnregisterExtensionType("arrow.uuid")) + // re-register once the test is complete + defer arrow.RegisterExtensionType(extensions.NewUUIDType()) rdr, err := NewReader(&buf, WithAllocator(pool)) defer rdr.Release() diff --git a/go/internal/types/extension_types.go b/go/internal/types/extension_types.go index 85c64d86bff..33ada2d488f 100644 --- a/go/internal/types/extension_types.go +++ b/go/internal/types/extension_types.go @@ -18,238 +18,15 @@ package types import ( - "bytes" "encoding/binary" "fmt" "reflect" - "strings" "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" - "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/json" - "github.com/google/uuid" "golang.org/x/xerrors" ) -var UUID = NewUUIDType() - -type UUIDBuilder struct { - *array.ExtensionBuilder -} - -func NewUUIDBuilder(mem memory.Allocator) *UUIDBuilder { - return &UUIDBuilder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewUUIDType())} -} - -func (b *UUIDBuilder) Append(v uuid.UUID) { - b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).Append(v[:]) -} - -func (b *UUIDBuilder) UnsafeAppend(v uuid.UUID) { - b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).UnsafeAppend(v[:]) -} - -func (b *UUIDBuilder) AppendValueFromString(s string) error { - if s == array.NullValueStr { - b.AppendNull() - return nil - } - - uid, err := uuid.Parse(s) - if err != nil { - return err - } - - b.Append(uid) - return nil -} - -func (b *UUIDBuilder) AppendValues(v []uuid.UUID, valid []bool) { - if len(v) != len(valid) && len(valid) != 0 { - panic("len(v) != len(valid) && len(valid) != 0") - } - - data := make([][]byte, len(v)) - for i := range v { - if len(valid) > 0 && !valid[i] { - continue - } - data[i] = v[i][:] - } - b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, valid) -} - -func (b *UUIDBuilder) UnmarshalOne(dec *json.Decoder) error { - t, err := dec.Token() - if err != nil { - return err - } - - var val uuid.UUID - switch v := t.(type) { - case string: - val, err = uuid.Parse(v) - if err != nil { - return err - } - case []byte: - val, err = uuid.ParseBytes(v) - if err != nil { - return err - } - case nil: - b.AppendNull() - return nil - default: - return &json.UnmarshalTypeError{ - Value: fmt.Sprint(t), - Type: reflect.TypeOf([]byte{}), - Offset: dec.InputOffset(), - Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), - } - } - - b.Append(val) - return nil -} - -func (b *UUIDBuilder) Unmarshal(dec *json.Decoder) error { - for dec.More() { - if err := b.UnmarshalOne(dec); err != nil { - return err - } - } - return nil -} - -func (b *UUIDBuilder) UnmarshalJSON(data []byte) error { - dec := json.NewDecoder(bytes.NewReader(data)) - t, err := dec.Token() - if err != nil { - return err - } - - if delim, ok := t.(json.Delim); !ok || delim != '[' { - return fmt.Errorf("uuid builder must unpack from json array, found %s", delim) - } - - return b.Unmarshal(dec) 
-} - -// UUIDArray is a simple array which is a FixedSizeBinary(16) -type UUIDArray struct { - array.ExtensionArrayBase -} - -func (a *UUIDArray) String() string { - arr := a.Storage().(*array.FixedSizeBinary) - o := new(strings.Builder) - o.WriteString("[") - for i := 0; i < arr.Len(); i++ { - if i > 0 { - o.WriteString(" ") - } - switch { - case a.IsNull(i): - o.WriteString(array.NullValueStr) - default: - fmt.Fprintf(o, "%q", a.Value(i)) - } - } - o.WriteString("]") - return o.String() -} - -func (a *UUIDArray) Value(i int) uuid.UUID { - if a.IsNull(i) { - return uuid.Nil - } - return uuid.Must(uuid.FromBytes(a.Storage().(*array.FixedSizeBinary).Value(i))) -} - -func (a *UUIDArray) ValueStr(i int) string { - switch { - case a.IsNull(i): - return array.NullValueStr - default: - return a.Value(i).String() - } -} - -func (a *UUIDArray) MarshalJSON() ([]byte, error) { - arr := a.Storage().(*array.FixedSizeBinary) - values := make([]interface{}, a.Len()) - for i := 0; i < a.Len(); i++ { - if a.IsValid(i) { - values[i] = uuid.Must(uuid.FromBytes(arr.Value(i))).String() - } - } - return json.Marshal(values) -} - -func (a *UUIDArray) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { - return nil - } - return a.Value(i) -} - -// UUIDType is a simple extension type that represents a FixedSizeBinary(16) -// to be used for representing UUIDs -type UUIDType struct { - arrow.ExtensionBase -} - -// NewUUIDType is a convenience function to create an instance of UUIDType -// with the correct storage type -func NewUUIDType() *UUIDType { - return &UUIDType{ExtensionBase: arrow.ExtensionBase{Storage: &arrow.FixedSizeBinaryType{ByteWidth: 16}}} -} - -// ArrayType returns TypeOf(UUIDArray{}) for constructing UUID arrays -func (*UUIDType) ArrayType() reflect.Type { - return reflect.TypeOf(UUIDArray{}) -} - -func (*UUIDType) ExtensionName() string { - return "uuid" -} - -func (e *UUIDType) String() string { - return fmt.Sprintf("extension_type", e.Storage) -} - -func (e *UUIDType) MarshalJSON() ([]byte, error) { - return []byte(fmt.Sprintf(`{"name":"%s","metadata":%s}`, e.ExtensionName(), e.Serialize())), nil -} - -// Serialize returns "uuid-serialized" for testing proper metadata passing -func (*UUIDType) Serialize() string { - return "uuid-serialized" -} - -// Deserialize expects storageType to be FixedSizeBinaryType{ByteWidth: 16} and the data to be -// "uuid-serialized" in order to correctly create a UUIDType for testing deserialize. -func (*UUIDType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { - if data != "uuid-serialized" { - return nil, fmt.Errorf("type identifier did not match: '%s'", data) - } - if !arrow.TypeEqual(storageType, &arrow.FixedSizeBinaryType{ByteWidth: 16}) { - return nil, fmt.Errorf("invalid storage type for UUIDType: %s", storageType.Name()) - } - return NewUUIDType(), nil -} - -// ExtensionEquals returns true if both extensions have the same name -func (e *UUIDType) ExtensionEquals(other arrow.ExtensionType) bool { - return e.ExtensionName() == other.ExtensionName() -} - -func (*UUIDType) NewBuilder(mem memory.Allocator) array.Builder { - return NewUUIDBuilder(mem) -} - // Parametric1Array is a simple int32 array for use with the Parametric1Type // in testing a parameterized user-defined extension type. 
type Parametric1Array struct { @@ -518,14 +295,14 @@ func (SmallintType) ArrayType() reflect.Type { return reflect.TypeOf(SmallintArr func (SmallintType) ExtensionName() string { return "smallint" } -func (SmallintType) Serialize() string { return "smallint" } +func (SmallintType) Serialize() string { return "smallint-serialized" } func (s *SmallintType) ExtensionEquals(other arrow.ExtensionType) bool { return s.Name() == other.Name() } func (SmallintType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { - if data != "smallint" { + if data != "smallint-serialized" { return nil, fmt.Errorf("type identifier did not match: '%s'", data) } if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Int16) { diff --git a/go/internal/types/extension_types_test.go b/go/internal/types/extension_types_test.go deleted file mode 100644 index 65f6353d01b..00000000000 --- a/go/internal/types/extension_types_test.go +++ /dev/null @@ -1,95 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package types_test - -import ( - "bytes" - "testing" - - "github.com/apache/arrow/go/v18/arrow" - "github.com/apache/arrow/go/v18/arrow/array" - "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/json" - "github.com/apache/arrow/go/v18/internal/types" - "github.com/google/uuid" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var testUUID = uuid.New() - -func TestUUIDExtensionBuilder(t *testing.T) { - mem := memory.NewCheckedAllocator(memory.DefaultAllocator) - defer mem.AssertSize(t, 0) - builder := types.NewUUIDBuilder(mem) - builder.Append(testUUID) - arr := builder.NewArray() - defer arr.Release() - arrStr := arr.String() - assert.Equal(t, "[\""+testUUID.String()+"\"]", arrStr) - jsonStr, err := json.Marshal(arr) - assert.NoError(t, err) - - arr1, _, err := array.FromJSON(mem, types.NewUUIDType(), bytes.NewReader(jsonStr)) - defer arr1.Release() - assert.NoError(t, err) - assert.Equal(t, arr, arr1) -} - -func TestUUIDExtensionRecordBuilder(t *testing.T) { - schema := arrow.NewSchema([]arrow.Field{ - {Name: "uuid", Type: types.NewUUIDType()}, - }, nil) - builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) - builder.Field(0).(*types.UUIDBuilder).Append(testUUID) - record := builder.NewRecord() - b, err := record.MarshalJSON() - require.NoError(t, err) - require.Equal(t, "[{\"uuid\":\""+testUUID.String()+"\"}\n]", string(b)) - record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) - require.NoError(t, err) - require.Equal(t, record, record1) -} - -func TestUUIDStringRoundTrip(t *testing.T) { - // 1. 
create array - mem := memory.NewCheckedAllocator(memory.DefaultAllocator) - defer mem.AssertSize(t, 0) - - b := types.NewUUIDBuilder(mem) - b.Append(uuid.Nil) - b.AppendNull() - b.Append(uuid.NameSpaceURL) - b.AppendNull() - b.Append(testUUID) - - arr := b.NewArray() - defer arr.Release() - - // 2. create array via AppendValueFromString - b1 := types.NewUUIDBuilder(mem) - defer b1.Release() - - for i := 0; i < arr.Len(); i++ { - assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) - } - - arr1 := b1.NewArray() - defer arr1.Release() - - assert.True(t, array.Equal(arr, arr1)) -} diff --git a/go/parquet/cmd/parquet_reader/main.go b/go/parquet/cmd/parquet_reader/main.go index 6e04f4254f9..4e480aeb866 100644 --- a/go/parquet/cmd/parquet_reader/main.go +++ b/go/parquet/cmd/parquet_reader/main.go @@ -154,7 +154,7 @@ func main() { if descr.ConvertedType() != schema.ConvertedTypes.None { fmt.Printf("/%s", descr.ConvertedType()) if descr.ConvertedType() == schema.ConvertedTypes.Decimal { - dec := descr.LogicalType().(*schema.DecimalLogicalType) + dec := descr.LogicalType().(schema.DecimalLogicalType) fmt.Printf("(%d,%d)", dec.Precision(), dec.Scale()) } } diff --git a/go/parquet/metadata/app_version.go b/go/parquet/metadata/app_version.go index 887ed79343a..345e9d440a1 100644 --- a/go/parquet/metadata/app_version.go +++ b/go/parquet/metadata/app_version.go @@ -164,7 +164,7 @@ func (v AppVersion) HasCorrectStatistics(coltype parquet.Type, logicalType schem // parquet-cpp-arrow version 4.0.0 fixed Decimal comparisons for creating min/max stats // parquet-cpp also becomes parquet-cpp-arrow as of version 4.0.0 if v.App == "parquet-cpp" || (v.App == "parquet-cpp-arrow" && v.LessThan(parquet1655FixedVersion)) { - if _, ok := logicalType.(*schema.DecimalLogicalType); ok && coltype == parquet.Types.FixedLenByteArray { + if _, ok := logicalType.(schema.DecimalLogicalType); ok && coltype == parquet.Types.FixedLenByteArray { return false } } diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go index 16282173a68..a238a78133e 100644 --- a/go/parquet/pqarrow/encode_arrow_test.go +++ b/go/parquet/pqarrow/encode_arrow_test.go @@ -30,6 +30,7 @@ import ( "github.com/apache/arrow/go/v18/arrow/bitutil" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/internal/types" @@ -715,16 +716,6 @@ type ParquetIOTestSuite struct { suite.Suite } -func (ps *ParquetIOTestSuite) SetupTest() { - ps.NoError(arrow.RegisterExtensionType(types.NewUUIDType())) -} - -func (ps *ParquetIOTestSuite) TearDownTest() { - if arrow.GetExtensionType("uuid") != nil { - ps.NoError(arrow.UnregisterExtensionType("uuid")) - } -} - func (ps *ParquetIOTestSuite) makeSimpleSchema(typ arrow.DataType, rep parquet.Repetition) *schema.GroupNode { byteWidth := int32(-1) @@ -2053,7 +2044,7 @@ func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(ps.T(), 0) - builder := types.NewUUIDBuilder(mem) + builder := extensions.NewUUIDBuilder(mem) builder.Append(uuid.New()) arr := builder.NewArray() defer arr.Release() @@ -2076,22 +2067,23 @@ func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() { { // Prepare `written` table with the extension type registered. 
- extType := types.NewUUIDType() + extType := types.NewSmallintType() bldr := array.NewExtensionBuilder(mem, extType) defer bldr.Release() - bldr.Builder.(*array.FixedSizeBinaryBuilder).AppendValues( - [][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")}, + bldr.Builder.(*array.Int16Builder).AppendValues( + []int16{0, 0, 1, 2}, []bool{false, true, true, true}) arr := bldr.NewArray() defer arr.Release() - if arrow.GetExtensionType("uuid") != nil { - ps.NoError(arrow.UnregisterExtensionType("uuid")) + if arrow.GetExtensionType("smallint") != nil { + ps.NoError(arrow.UnregisterExtensionType("smallint")) + defer arrow.RegisterExtensionType(extType) } - fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true} + fld := arrow.Field{Name: "smallint", Type: arr.DataType(), Nullable: true} cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr}) defer arr.Release() // NewChunked written = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1) @@ -2101,16 +2093,16 @@ func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() { { // Prepare `expected` table with the extension type unregistered in the underlying type. - bldr := array.NewFixedSizeBinaryBuilder(mem, &arrow.FixedSizeBinaryType{ByteWidth: 16}) + bldr := array.NewInt16Builder(mem) defer bldr.Release() bldr.AppendValues( - [][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")}, + []int16{0, 0, 1, 2}, []bool{false, true, true, true}) arr := bldr.NewArray() defer arr.Release() - fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true} + fld := arrow.Field{Name: "smallint", Type: arr.DataType(), Nullable: true} cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr}) defer arr.Release() // NewChunked expected = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1) @@ -2147,13 +2139,55 @@ func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() { ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc) expectedMd := arrow.MetadataFrom(map[string]string{ - ipc.ExtensionTypeKeyName: "uuid", - ipc.ExtensionMetadataKeyName: "uuid-serialized", + ipc.ExtensionTypeKeyName: "smallint", + ipc.ExtensionMetadataKeyName: "smallint-serialized", "PARQUET:field_id": "-1", }) ps.Truef(expectedMd.Equal(tbl.Column(0).Field().Metadata), "expected: %v\ngot: %v", expectedMd, tbl.Column(0).Field().Metadata) } +func (ps *ParquetIOTestSuite) TestArrowExtensionTypeLogicalType() { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(ps.T(), 0) + + jsonType, err := extensions.NewJSONType(arrow.BinaryTypes.String) + ps.NoError(err) + + sch := arrow.NewSchema([]arrow.Field{ + {Name: "uuid", Type: extensions.NewUUIDType()}, + {Name: "json", Type: jsonType}, + }, + nil, + ) + bldr := array.NewRecordBuilder(mem, sch) + defer bldr.Release() + + bldr.Field(0).(*extensions.UUIDBuilder).Append(uuid.New()) + bldr.Field(1).(*array.ExtensionBuilder).AppendValueFromString(`{"hello": ["world", 2, true], "world": null}`) + rec := bldr.NewRecord() + defer rec.Release() + + var buf bytes.Buffer + wr, err := pqarrow.NewFileWriter( + sch, + &buf, + parquet.NewWriterProperties(), + pqarrow.DefaultWriterProps(), + ) + ps.Require().NoError(err) + + ps.Require().NoError(wr.Write(rec)) + ps.Require().NoError(wr.Close()) + + rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) + 
ps.Require().NoError(err) + defer rdr.Close() + + pqSchema := rdr.MetaData().Schema + ps.True(pqSchema.Column(0).LogicalType().Equals(schema.UUIDLogicalType{})) + ps.True(pqSchema.Column(1).LogicalType().Equals(schema.JSONLogicalType{})) +} + func TestWriteTableMemoryAllocation(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) sc := arrow.NewSchema([]arrow.Field{ @@ -2163,7 +2197,7 @@ func TestWriteTableMemoryAllocation(t *testing.T) { arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64, Nullable: true}, arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64, Nullable: true})}, {Name: "arr_i64", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)}, - {Name: "uuid", Type: types.NewUUIDType(), Nullable: true}, + {Name: "uuid", Type: extensions.NewUUIDType(), Nullable: true}, }, nil) bld := array.NewRecordBuilder(mem, sc) @@ -2176,7 +2210,7 @@ func TestWriteTableMemoryAllocation(t *testing.T) { abld := bld.Field(3).(*array.ListBuilder) abld.Append(true) abld.ValueBuilder().(*array.Int64Builder).Append(2) - bld.Field(4).(*types.UUIDBuilder).Append(uuid.MustParse("00000000-0000-0000-0000-000000000001")) + bld.Field(4).(*extensions.UUIDBuilder).Append(uuid.MustParse("00000000-0000-0000-0000-000000000001")) rec := bld.NewRecord() bld.Release() diff --git a/go/parquet/pqarrow/path_builder_test.go b/go/parquet/pqarrow/path_builder_test.go index 9bbae426b8a..364f836d0bb 100644 --- a/go/parquet/pqarrow/path_builder_test.go +++ b/go/parquet/pqarrow/path_builder_test.go @@ -22,8 +22,8 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -364,12 +364,12 @@ func TestNestedExtensionListsWithSomeNulls(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - listType := arrow.ListOf(types.NewUUIDType()) + listType := arrow.ListOf(extensions.NewUUIDType()) bldr := array.NewListBuilder(mem, listType) defer bldr.Release() nestedBldr := bldr.ValueBuilder().(*array.ListBuilder) - vb := nestedBldr.ValueBuilder().(*types.UUIDBuilder) + vb := nestedBldr.ValueBuilder().(*extensions.UUIDBuilder) uuid1 := uuid.New() uuid3 := uuid.New() diff --git a/go/parquet/pqarrow/schema.go b/go/parquet/pqarrow/schema.go index ce5cc6f9050..4882077671f 100644 --- a/go/parquet/pqarrow/schema.go +++ b/go/parquet/pqarrow/schema.go @@ -25,7 +25,6 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/flight" - "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/parquet" "github.com/apache/arrow/go/v18/parquet/file" @@ -120,6 +119,15 @@ func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) { return ret, nil } +// ExtensionCustomParquetType is an interface that Arrow ExtensionTypes may implement +// to specify the target LogicalType to use when converting to Parquet. +// +// The PrimitiveType is not configurable, and is determined by a fixed mapping from +// the extension's StorageType to a Parquet type (see getParquetType in pqarrow source). 
+type ExtensionCustomParquetType interface { + ParquetLogicalType() schema.LogicalType +} + func isDictionaryReadSupported(dt arrow.DataType) bool { return arrow.IsBinaryLike(dt.ID()) } @@ -250,104 +258,14 @@ func structToNode(typ *arrow.StructType, name string, nullable bool, props *parq } func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) { - var ( - logicalType schema.LogicalType = schema.NoLogicalType{} - typ parquet.Type - repType = repFromNullable(field.Nullable) - length = -1 - precision = -1 - scale = -1 - err error - ) + repType := repFromNullable(field.Nullable) + // Handle complex types i.e. GroupNodes switch field.Type.ID() { case arrow.NULL: - typ = parquet.Types.Int32 - logicalType = &schema.NullLogicalType{} if repType != parquet.Repetitions.Optional { return nil, xerrors.New("nulltype arrow field must be nullable") } - case arrow.BOOL: - typ = parquet.Types.Boolean - case arrow.UINT8: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(8, false) - case arrow.INT8: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(8, true) - case arrow.UINT16: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(16, false) - case arrow.INT16: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(16, true) - case arrow.UINT32: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(32, false) - case arrow.INT32: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(32, true) - case arrow.UINT64: - typ = parquet.Types.Int64 - logicalType = schema.NewIntLogicalType(64, false) - case arrow.INT64: - typ = parquet.Types.Int64 - logicalType = schema.NewIntLogicalType(64, true) - case arrow.FLOAT32: - typ = parquet.Types.Float - case arrow.FLOAT64: - typ = parquet.Types.Double - case arrow.STRING, arrow.LARGE_STRING: - logicalType = schema.StringLogicalType{} - fallthrough - case arrow.BINARY, arrow.LARGE_BINARY: - typ = parquet.Types.ByteArray - case arrow.FIXED_SIZE_BINARY: - typ = parquet.Types.FixedLenByteArray - length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth - case arrow.DECIMAL, arrow.DECIMAL256: - dectype := field.Type.(arrow.DecimalType) - precision = int(dectype.GetPrecision()) - scale = int(dectype.GetScale()) - - if props.StoreDecimalAsInteger() && 1 <= precision && precision <= 18 { - if precision <= 9 { - typ = parquet.Types.Int32 - } else { - typ = parquet.Types.Int64 - } - } else { - typ = parquet.Types.FixedLenByteArray - length = int(DecimalSize(int32(precision))) - } - - logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale)) - case arrow.DATE32: - typ = parquet.Types.Int32 - logicalType = schema.DateLogicalType{} - case arrow.DATE64: - typ = parquet.Types.Int32 - logicalType = schema.DateLogicalType{} - case arrow.TIMESTAMP: - typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops) - if err != nil { - return nil, err - } - case arrow.TIME32: - typ = parquet.Types.Int32 - logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis) - case arrow.TIME64: - typ = parquet.Types.Int64 - timeType := field.Type.(*arrow.Time64Type) - if timeType.Unit == arrow.Nanosecond { - logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos) - } else { - logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros) - } - case arrow.FLOAT16: - typ = parquet.Types.FixedLenByteArray - length = arrow.Float16SizeBytes - logicalType = 
schema.Float16LogicalType{} case arrow.STRUCT: return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops) case arrow.FIXED_SIZE_LIST, arrow.LIST: @@ -369,16 +287,6 @@ func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties dictType := field.Type.(*arrow.DictionaryType) return fieldToNode(name, arrow.Field{Name: name, Type: dictType.ValueType, Nullable: field.Nullable, Metadata: field.Metadata}, props, arrprops) - case arrow.EXTENSION: - return fieldToNode(name, arrow.Field{ - Name: name, - Type: field.Type.(arrow.ExtensionType).StorageType(), - Nullable: field.Nullable, - Metadata: arrow.MetadataFrom(map[string]string{ - ipc.ExtensionTypeKeyName: field.Type.(arrow.ExtensionType).ExtensionName(), - ipc.ExtensionMetadataKeyName: field.Type.(arrow.ExtensionType).Serialize(), - }), - }, props, arrprops) case arrow.MAP: mapType := field.Type.(*arrow.MapType) keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops) @@ -402,8 +310,12 @@ func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties }, -1) } return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1) - default: - return nil, fmt.Errorf("%w: support for %s", arrow.ErrNotImplemented, field.Type.ID()) + } + + // Not a GroupNode + typ, logicalType, length, err := getParquetType(field.Type, props, arrprops) + if err != nil { + return nil, err } return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata)) @@ -472,7 +384,7 @@ func (s schemaTree) RecordLeaf(leaf *SchemaField) { s.manifest.ColIndexToField[leaf.ColIndex] = leaf } -func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) { +func arrowInt(log schema.IntLogicalType) (arrow.DataType, error) { switch log.BitWidth() { case 8: if log.IsSigned() { @@ -499,7 +411,7 @@ func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) { } } -func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) { +func arrowTime32(logical schema.TimeLogicalType) (arrow.DataType, error) { if logical.TimeUnit() == schema.TimeUnitMillis { return arrow.FixedWidthTypes.Time32ms, nil } @@ -507,7 +419,7 @@ func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) { return nil, xerrors.New(logical.String() + " cannot annotate a time32") } -func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) { +func arrowTime64(logical schema.TimeLogicalType) (arrow.DataType, error) { switch logical.TimeUnit() { case schema.TimeUnitMicros: return arrow.FixedWidthTypes.Time64us, nil @@ -518,7 +430,7 @@ func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) { } } -func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) { +func arrowTimestamp(logical schema.TimestampLogicalType) (arrow.DataType, error) { tz := "" // ConvertedTypes are adjusted to UTC per backward compatibility guidelines @@ -539,7 +451,7 @@ func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error } } -func arrowDecimal(logical *schema.DecimalLogicalType) arrow.DataType { +func arrowDecimal(logical schema.DecimalLogicalType) arrow.DataType { if logical.Precision() <= decimal128.MaxPrecision { return &arrow.Decimal128Type{Precision: logical.Precision(), Scale: logical.Scale()} } @@ -550,11 +462,11 @@ func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) { switch logtype := logical.(type) { case schema.NoLogicalType: return 
arrow.PrimitiveTypes.Int32, nil - case *schema.TimeLogicalType: + case schema.TimeLogicalType: return arrowTime32(logtype) - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil - case *schema.IntLogicalType: + case schema.IntLogicalType: return arrowInt(logtype) case schema.DateLogicalType: return arrow.FixedWidthTypes.Date32, nil @@ -569,13 +481,13 @@ func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) { } switch logtype := logical.(type) { - case *schema.IntLogicalType: + case schema.IntLogicalType: return arrowInt(logtype) - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil - case *schema.TimeLogicalType: + case schema.TimeLogicalType: return arrowTime64(logtype) - case *schema.TimestampLogicalType: + case schema.TimestampLogicalType: return arrowTimestamp(logtype) default: return nil, xerrors.New(logical.String() + " cannot annotate int64") @@ -586,7 +498,7 @@ func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) { switch logtype := logical.(type) { case schema.StringLogicalType: return arrow.BinaryTypes.String, nil - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil case schema.NoLogicalType, schema.EnumLogicalType, @@ -600,7 +512,7 @@ func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) { func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) { switch logtype := logical.(type) { - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType: return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil @@ -611,6 +523,84 @@ func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, erro } } +func getParquetType(typ arrow.DataType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, int, error) { + switch typ.ID() { + case arrow.NULL: + return parquet.Types.Int32, schema.NullLogicalType{}, -1, nil + case arrow.BOOL: + return parquet.Types.Boolean, schema.NoLogicalType{}, -1, nil + case arrow.UINT8: + return parquet.Types.Int32, schema.NewIntLogicalType(8, false), -1, nil + case arrow.INT8: + return parquet.Types.Int32, schema.NewIntLogicalType(8, true), -1, nil + case arrow.UINT16: + return parquet.Types.Int32, schema.NewIntLogicalType(16, false), -1, nil + case arrow.INT16: + return parquet.Types.Int32, schema.NewIntLogicalType(16, true), -1, nil + case arrow.UINT32: + return parquet.Types.Int32, schema.NewIntLogicalType(32, false), -1, nil + case arrow.INT32: + return parquet.Types.Int32, schema.NewIntLogicalType(32, true), -1, nil + case arrow.UINT64: + return parquet.Types.Int64, schema.NewIntLogicalType(64, false), -1, nil + case arrow.INT64: + return parquet.Types.Int64, schema.NewIntLogicalType(64, true), -1, nil + case arrow.FLOAT32: + return parquet.Types.Float, schema.NoLogicalType{}, -1, nil + case arrow.FLOAT64: + return parquet.Types.Double, schema.NoLogicalType{}, -1, nil + case arrow.STRING, arrow.LARGE_STRING: + return parquet.Types.ByteArray, schema.StringLogicalType{}, -1, nil + case arrow.BINARY, arrow.LARGE_BINARY: + return parquet.Types.ByteArray, schema.NoLogicalType{}, -1, nil + case arrow.FIXED_SIZE_BINARY: + return parquet.Types.FixedLenByteArray, schema.NoLogicalType{}, typ.(*arrow.FixedSizeBinaryType).ByteWidth, nil + case arrow.DECIMAL, 
arrow.DECIMAL256: + dectype := typ.(arrow.DecimalType) + precision := int(dectype.GetPrecision()) + scale := int(dectype.GetScale()) + + if !props.StoreDecimalAsInteger() || precision > 18 { + return parquet.Types.FixedLenByteArray, schema.NewDecimalLogicalType(int32(precision), int32(scale)), int(DecimalSize(int32(precision))), nil + } + + pqType := parquet.Types.Int32 + if precision > 9 { + pqType = parquet.Types.Int64 + } + + return pqType, schema.NoLogicalType{}, -1, nil + case arrow.DATE32: + return parquet.Types.Int32, schema.DateLogicalType{}, -1, nil + case arrow.DATE64: + return parquet.Types.Int32, schema.DateLogicalType{}, -1, nil + case arrow.TIMESTAMP: + pqType, logicalType, err := getTimestampMeta(typ.(*arrow.TimestampType), props, arrprops) + return pqType, logicalType, -1, err + case arrow.TIME32: + return parquet.Types.Int32, schema.NewTimeLogicalType(true, schema.TimeUnitMillis), -1, nil + case arrow.TIME64: + pqTimeUnit := schema.TimeUnitMicros + if typ.(*arrow.Time64Type).Unit == arrow.Nanosecond { + pqTimeUnit = schema.TimeUnitNanos + } + + return parquet.Types.Int64, schema.NewTimeLogicalType(true, pqTimeUnit), -1, nil + case arrow.FLOAT16: + return parquet.Types.FixedLenByteArray, schema.Float16LogicalType{}, arrow.Float16SizeBytes, nil + case arrow.EXTENSION: + storageType := typ.(arrow.ExtensionType).StorageType() + pqType, logicalType, length, err := getParquetType(storageType, props, arrprops) + if withCustomType, ok := typ.(ExtensionCustomParquetType); ok { + logicalType = withCustomType.ParquetLogicalType() + } + + return pqType, logicalType, length, err + default: + return parquet.Type(0), nil, 0, fmt.Errorf("%w: support for %s", arrow.ErrNotImplemented, typ.ID()) + } +} + func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) { if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) { return arrow.Null, nil diff --git a/go/parquet/pqarrow/schema_test.go b/go/parquet/pqarrow/schema_test.go index 24b031c174b..528200fd0e7 100644 --- a/go/parquet/pqarrow/schema_test.go +++ b/go/parquet/pqarrow/schema_test.go @@ -21,10 +21,10 @@ import ( "testing" "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/flight" "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/apache/arrow/go/v18/parquet" "github.com/apache/arrow/go/v18/parquet/metadata" "github.com/apache/arrow/go/v18/parquet/pqarrow" @@ -34,7 +34,7 @@ import ( ) func TestGetOriginSchemaBase64(t *testing.T) { - uuidType := types.NewUUIDType() + uuidType := extensions.NewUUIDType() md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"}) extMd := arrow.NewMetadata([]string{ipc.ExtensionMetadataKeyName, ipc.ExtensionTypeKeyName, "PARQUET:field_id"}, []string{uuidType.Serialize(), uuidType.ExtensionName(), "-1"}) origArrSc := arrow.NewSchema([]arrow.Field{ @@ -44,10 +44,6 @@ func TestGetOriginSchemaBase64(t *testing.T) { }, nil) arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator) - if err := arrow.RegisterExtensionType(uuidType); err != nil { - t.Fatal(err) - } - defer arrow.UnregisterExtensionType(uuidType.ExtensionName()) pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps()) require.NoError(t, err) @@ -71,11 +67,7 @@ func TestGetOriginSchemaBase64(t *testing.T) { } func TestGetOriginSchemaUnregisteredExtension(t 
*testing.T) { - uuidType := types.NewUUIDType() - if err := arrow.RegisterExtensionType(uuidType); err != nil { - t.Fatal(err) - } - + uuidType := extensions.NewUUIDType() md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"}) origArrSc := arrow.NewSchema([]arrow.Field{ {Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md}, @@ -90,6 +82,7 @@ func TestGetOriginSchemaUnregisteredExtension(t *testing.T) { kv.Append("ARROW:schema", base64.StdEncoding.EncodeToString(arrSerializedSc)) arrow.UnregisterExtensionType(uuidType.ExtensionName()) + defer arrow.RegisterExtensionType(uuidType) arrsc, err := pqarrow.FromParquet(pqschema, nil, kv) require.NoError(t, err) diff --git a/go/parquet/schema/converted_types.go b/go/parquet/schema/converted_types.go index 5fc10f61ceb..b2b6f50cbf6 100644 --- a/go/parquet/schema/converted_types.go +++ b/go/parquet/schema/converted_types.go @@ -113,13 +113,9 @@ func (p ConvertedType) ToLogicalType(convertedDecimal DecimalMetadata) LogicalTy case ConvertedTypes.TimeMicros: return NewTimeLogicalType(true /* adjustedToUTC */, TimeUnitMicros) case ConvertedTypes.TimestampMillis: - t := NewTimestampLogicalType(true /* adjustedToUTC */, TimeUnitMillis) - t.(*TimestampLogicalType).fromConverted = true - return t + return NewTimestampLogicalTypeWithOpts(WithTSIsAdjustedToUTC(), WithTSTimeUnitType(TimeUnitMillis), WithTSFromConverted()) case ConvertedTypes.TimestampMicros: - t := NewTimestampLogicalType(true /* adjustedToUTC */, TimeUnitMicros) - t.(*TimestampLogicalType).fromConverted = true - return t + return NewTimestampLogicalTypeWithOpts(WithTSIsAdjustedToUTC(), WithTSTimeUnitType(TimeUnitMicros), WithTSFromConverted()) case ConvertedTypes.Interval: return IntervalLogicalType{} case ConvertedTypes.Int8: diff --git a/go/parquet/schema/logical_types.go b/go/parquet/schema/logical_types.go index e8adce1ca14..fa46ea0172f 100644 --- a/go/parquet/schema/logical_types.go +++ b/go/parquet/schema/logical_types.go @@ -45,21 +45,21 @@ func getLogicalType(l *format.LogicalType) LogicalType { case l.IsSetENUM(): return EnumLogicalType{} case l.IsSetDECIMAL(): - return &DecimalLogicalType{typ: l.DECIMAL} + return DecimalLogicalType{typ: l.DECIMAL} case l.IsSetDATE(): return DateLogicalType{} case l.IsSetTIME(): if timeUnitFromThrift(l.TIME.Unit) == TimeUnitUnknown { panic("parquet: TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type") } - return &TimeLogicalType{typ: l.TIME} + return TimeLogicalType{typ: l.TIME} case l.IsSetTIMESTAMP(): if timeUnitFromThrift(l.TIMESTAMP.Unit) == TimeUnitUnknown { panic("parquet: TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type") } - return &TimestampLogicalType{typ: l.TIMESTAMP} + return TimestampLogicalType{typ: l.TIMESTAMP} case l.IsSetINTEGER(): - return &IntLogicalType{typ: l.INTEGER} + return IntLogicalType{typ: l.INTEGER} case l.IsSetUNKNOWN(): return NullLogicalType{} case l.IsSetJSON(): @@ -344,7 +344,7 @@ func NewDecimalLogicalType(precision int32, scale int32) LogicalType { if scale < 0 || scale > precision { panic("parquet: scale must be a non-negative integer that does not exceed precision for decimal logical type") } - return &DecimalLogicalType{typ: &format.DecimalType{Precision: precision, Scale: scale}} + return DecimalLogicalType{typ: &format.DecimalType{Precision: precision, Scale: scale}} } // DecimalLogicalType is used to represent a decimal value of a given @@ -405,7 +405,7 @@ func (t DecimalLogicalType) toThrift() *format.LogicalType { } func (t 
DecimalLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*DecimalLogicalType) + other, ok := rhs.(DecimalLogicalType) if !ok { return false } @@ -509,7 +509,7 @@ func createTimeUnit(unit TimeUnitType) *format.TimeUnit { // NewTimeLogicalType returns a time type of the given unit. func NewTimeLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { - return &TimeLogicalType{typ: &format.TimeType{ + return TimeLogicalType{typ: &format.TimeType{ IsAdjustedToUTC: isAdjustedToUTC, Unit: createTimeUnit(unit), }} @@ -584,7 +584,7 @@ func (t TimeLogicalType) toThrift() *format.LogicalType { } func (t TimeLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*TimeLogicalType) + other, ok := rhs.(TimeLogicalType) if !ok { return false } @@ -595,7 +595,7 @@ func (t TimeLogicalType) Equals(rhs LogicalType) bool { // NewTimestampLogicalType returns a logical timestamp type with "forceConverted" // set to false func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { - return &TimestampLogicalType{ + return TimestampLogicalType{ typ: &format.TimestampType{ IsAdjustedToUTC: isAdjustedToUTC, Unit: createTimeUnit(unit), @@ -608,7 +608,7 @@ func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalTyp // NewTimestampLogicalTypeForce returns a timestamp logical type with // "forceConverted" set to true func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { - return &TimestampLogicalType{ + return TimestampLogicalType{ typ: &format.TimestampType{ IsAdjustedToUTC: isAdjustedToUTC, Unit: createTimeUnit(unit), @@ -654,14 +654,14 @@ func WithTSFromConverted() TimestampOpt { // // TimestampType Unit defaults to milliseconds (TimeUnitMillis) func NewTimestampLogicalTypeWithOpts(opts ...TimestampOpt) LogicalType { - ts := &TimestampLogicalType{ + ts := TimestampLogicalType{ typ: &format.TimestampType{ Unit: createTimeUnit(TimeUnitMillis), // default to milliseconds }, } for _, o := range opts { - o(ts) + o(&ts) } return ts @@ -760,7 +760,7 @@ func (t TimestampLogicalType) toThrift() *format.LogicalType { } func (t TimestampLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*TimestampLogicalType) + other, ok := rhs.(TimestampLogicalType) if !ok { return false } @@ -778,7 +778,7 @@ func NewIntLogicalType(bitWidth int8, signed bool) LogicalType { default: panic("parquet: bit width must be exactly 8, 16, 32, or 64 for Int logical type") } - return &IntLogicalType{ + return IntLogicalType{ typ: &format.IntType{ BitWidth: bitWidth, IsSigned: signed, @@ -864,7 +864,7 @@ func (t IntLogicalType) toThrift() *format.LogicalType { } func (t IntLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*IntLogicalType) + other, ok := rhs.(IntLogicalType) if !ok { return false } diff --git a/go/parquet/schema/logical_types_test.go b/go/parquet/schema/logical_types_test.go index e33925966e1..395d1504182 100644 --- a/go/parquet/schema/logical_types_test.go +++ b/go/parquet/schema/logical_types_test.go @@ -38,18 +38,18 @@ func TestConvertedLogicalEquivalences(t *testing.T) { {"list", schema.ConvertedTypes.List, schema.NewListLogicalType(), schema.NewListLogicalType()}, {"enum", schema.ConvertedTypes.Enum, schema.EnumLogicalType{}, schema.EnumLogicalType{}}, {"date", schema.ConvertedTypes.Date, schema.DateLogicalType{}, schema.DateLogicalType{}}, - {"timemilli", schema.ConvertedTypes.TimeMillis, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), &schema.TimeLogicalType{}}, - 
{"timemicro", schema.ConvertedTypes.TimeMicros, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), &schema.TimeLogicalType{}}, - {"timestampmilli", schema.ConvertedTypes.TimestampMillis, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), &schema.TimestampLogicalType{}}, - {"timestampmicro", schema.ConvertedTypes.TimestampMicros, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), &schema.TimestampLogicalType{}}, - {"uint8", schema.ConvertedTypes.Uint8, schema.NewIntLogicalType(8 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"uint16", schema.ConvertedTypes.Uint16, schema.NewIntLogicalType(16 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"uint32", schema.ConvertedTypes.Uint32, schema.NewIntLogicalType(32 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"uint64", schema.ConvertedTypes.Uint64, schema.NewIntLogicalType(64 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"int8", schema.ConvertedTypes.Int8, schema.NewIntLogicalType(8 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, - {"int16", schema.ConvertedTypes.Int16, schema.NewIntLogicalType(16 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, - {"int32", schema.ConvertedTypes.Int32, schema.NewIntLogicalType(32 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, - {"int64", schema.ConvertedTypes.Int64, schema.NewIntLogicalType(64 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, + {"timemilli", schema.ConvertedTypes.TimeMillis, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), schema.TimeLogicalType{}}, + {"timemicro", schema.ConvertedTypes.TimeMicros, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.TimeLogicalType{}}, + {"timestampmilli", schema.ConvertedTypes.TimestampMillis, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), schema.TimestampLogicalType{}}, + {"timestampmicro", schema.ConvertedTypes.TimestampMicros, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.TimestampLogicalType{}}, + {"uint8", schema.ConvertedTypes.Uint8, schema.NewIntLogicalType(8 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"uint16", schema.ConvertedTypes.Uint16, schema.NewIntLogicalType(16 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"uint32", schema.ConvertedTypes.Uint32, schema.NewIntLogicalType(32 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"uint64", schema.ConvertedTypes.Uint64, schema.NewIntLogicalType(64 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"int8", schema.ConvertedTypes.Int8, schema.NewIntLogicalType(8 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, + {"int16", schema.ConvertedTypes.Int16, schema.NewIntLogicalType(16 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, + {"int32", schema.ConvertedTypes.Int32, schema.NewIntLogicalType(32 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, + {"int64", schema.ConvertedTypes.Int64, schema.NewIntLogicalType(64 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, {"json", schema.ConvertedTypes.JSON, schema.JSONLogicalType{}, schema.JSONLogicalType{}}, {"bson", schema.ConvertedTypes.BSON, schema.BSONLogicalType{}, schema.BSONLogicalType{}}, {"interval", schema.ConvertedTypes.Interval, schema.IntervalLogicalType{}, schema.IntervalLogicalType{}}, 
@@ -72,8 +72,8 @@ func TestConvertedLogicalEquivalences(t *testing.T) { fromMake := schema.NewDecimalLogicalType(10, 4) assert.IsType(t, fromMake, fromConverted) assert.True(t, fromConverted.Equals(fromMake)) - assert.IsType(t, &schema.DecimalLogicalType{}, fromConverted) - assert.IsType(t, &schema.DecimalLogicalType{}, fromMake) + assert.IsType(t, schema.DecimalLogicalType{}, fromConverted) + assert.IsType(t, schema.DecimalLogicalType{}, fromMake) assert.True(t, schema.NewDecimalLogicalType(16, 0).Equals(schema.NewDecimalLogicalType(16, 0))) }) } @@ -160,12 +160,12 @@ func TestNewTypeIncompatibility(t *testing.T) { {"uuid", schema.UUIDLogicalType{}, schema.UUIDLogicalType{}}, {"float16", schema.Float16LogicalType{}, schema.Float16LogicalType{}}, {"null", schema.NullLogicalType{}, schema.NullLogicalType{}}, - {"not-utc-time_milli", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMillis), &schema.TimeLogicalType{}}, - {"not-utc-time-micro", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMicros), &schema.TimeLogicalType{}}, - {"not-utc-time-nano", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, - {"utc-time-nano", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, - {"not-utc-timestamp-nano", schema.NewTimestampLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, - {"utc-timestamp-nano", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, + {"not-utc-time_milli", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMillis), schema.TimeLogicalType{}}, + {"not-utc-time-micro", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMicros), schema.TimeLogicalType{}}, + {"not-utc-time-nano", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimeLogicalType{}}, + {"utc-time-nano", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimeLogicalType{}}, + {"not-utc-timestamp-nano", schema.NewTimestampLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimestampLogicalType{}}, + {"utc-timestamp-nano", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimestampLogicalType{}}, } for _, tt := range tests { diff --git a/go/parquet/schema/schema_element_test.go b/go/parquet/schema/schema_element_test.go index 7da55ce93ab..e427ba6485e 100644 --- a/go/parquet/schema/schema_element_test.go +++ b/go/parquet/schema/schema_element_test.go @@ -192,7 +192,7 @@ func (s *SchemaElementConstructionSuite) TestSimple() { func (s *SchemaElementConstructionSuite) reconstructDecimal(c schemaElementConstructArgs) *decimalSchemaElementConstruction { ret := s.reconstruct(c) - dec := c.logical.(*DecimalLogicalType) + dec := c.logical.(DecimalLogicalType) return &decimalSchemaElementConstruction{*ret, int(dec.Precision()), int(dec.Scale())} } @@ -359,7 +359,7 @@ func (s *SchemaElementConstructionSuite) TestTemporal() { func (s *SchemaElementConstructionSuite) reconstructInteger(c schemaElementConstructArgs) *intSchemaElementConstruction { base := s.reconstruct(c) - l := c.logical.(*IntLogicalType) + l := c.logical.(IntLogicalType) return &intSchemaElementConstruction{ *base, l.BitWidth(), From 82ecf3e6ed8cb58a08d600041617ce85c9bdb7c1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 22 Aug 2024 22:57:14 +0200 Subject: [PATCH 
063/157] MINOR: [CI][C++][Python] Fix Cuda builds on git main (#43789) On the Cuda self-hosted runners, we need to use legacy `docker-compose` on all Archery Docker invocations, including the "image push" step. This is because the Docker client version on those runners is too old to accept the `--file` option to the `compose` subcommand. This is a followup to https://github.com/apache/arrow/pull/43586 . The image push step cannot easily be verified in a PR, hence this second PR. Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- dev/tasks/docker-tests/github.cuda.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml index 9c7adf53a6f..8c04da8a91a 100644 --- a/dev/tasks/docker-tests/github.cuda.yml +++ b/dev/tasks/docker-tests/github.cuda.yml @@ -26,6 +26,8 @@ jobs: runs-on: ['self-hosted', 'cuda'] {{ macros.github_set_env(env) }} timeout-minutes: {{ timeout|default(60) }} + env: + ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 steps: {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }} # python 3.8 is installed on the runner, no need to install @@ -34,7 +36,6 @@ jobs: - name: Execute Docker Build shell: bash env: - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 {{ macros.github_set_sccache_envvars()|indent(8) }} run: | archery docker run \ From bad064f705ec9fc72efac2d13a1fc3fac6d3d137 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 14:08:26 -0700 Subject: [PATCH 064/157] MINOR: [C++] Ensure setting the default CMAKE_BUILD_TYPE (#43794) ### Rationale for this change The current logic for detecting whether the `CMAKE_BUILD_TYPE` is set is incorrect. That variable is never fully undefined; by default, in cases where it is unset is actually set to the empty string. Therefore, the condition that must be checked is not whether the variable is defined, but whether it tests to a truthy value (i.e. is a non-empty string). I consider this a minor change so I have not opened an associated issue. ### What changes are included in this PR? This PR changes `if(NOT DEFINED CMAKE_BUILD_TYPE)` to `if(NOT CMAKE_BUILD_TYPE)`. ### Are these changes tested? Since this fixes a particular CMake build scenario I am not sure if a test is merited, or where one would be added. ### Are there any user-facing changes? No. 
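As a purely illustrative sketch (not part of this patch), the difference between the two checks can be seen in a standalone CMakeLists.txt; the project name and messages below are placeholders:

cmake_minimum_required(VERSION 3.16)
project(build_type_check_demo)

# With a plain `cmake -S . -B build` invocation and no -DCMAKE_BUILD_TYPE=...,
# CMAKE_BUILD_TYPE exists as an empty cache entry, so this branch is never taken.
if(NOT DEFINED CMAKE_BUILD_TYPE)
  message(STATUS "never reached: the variable is defined (but empty)")
endif()

# The empty string is falsy in CMake, so this branch correctly applies the default.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.")
endif()
message(STATUS "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")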
Authored-by: Vyas Ramasubramani Signed-off-by: Sutou Kouhei --- cpp/CMakeLists.txt | 2 +- cpp/examples/minimal_build/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a1e3138da9e..5ead9e4b063 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -84,7 +84,7 @@ set(ARROW_VERSION "18.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") # if no build type is specified, default to release builds -if(NOT DEFINED CMAKE_BUILD_TYPE) +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.") diff --git a/cpp/examples/minimal_build/CMakeLists.txt b/cpp/examples/minimal_build/CMakeLists.txt index b4a7cde938c..95dad34221a 100644 --- a/cpp/examples/minimal_build/CMakeLists.txt +++ b/cpp/examples/minimal_build/CMakeLists.txt @@ -30,7 +30,7 @@ endif() # We require a C++17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) -if(NOT DEFINED CMAKE_BUILD_TYPE) +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() From 53b15b61691dde1ea86e14b7a2216fa0a26f8054 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:17:29 -0400 Subject: [PATCH 065/157] MINOR: [Go] Fix Flakey TestRowsPrematureCloseDuringNextLoop Test (#43804) ### Rationale for this change Fixes a race condition in rows initialization that has been causing intermittent test failures. ### What changes are included in this PR? Split query and init context. Update test to check for failure _after_ reading rows. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- go/arrow/flight/flightsql/driver/driver.go | 10 ++++++---- go/arrow/flight/flightsql/driver/driver_test.go | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/go/arrow/flight/flightsql/driver/driver.go b/go/arrow/flight/flightsql/driver/driver.go index 0f2b02deaca..0513fe1ecd3 100644 --- a/go/arrow/flight/flightsql/driver/driver.go +++ b/go/arrow/flight/flightsql/driver/driver.go @@ -266,13 +266,14 @@ func (s *Stmt) QueryContext(ctx context.Context, args []driver.NamedValue) (driv return nil, err } + execCtx := ctx if _, set := ctx.Deadline(); !set && s.timeout > 0 { var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, s.timeout) + execCtx, cancel = context.WithTimeout(ctx, s.timeout) defer cancel() } - info, err := s.stmt.Execute(ctx) + info, err := s.stmt.Execute(execCtx) if err != nil { return nil, err } @@ -497,13 +498,14 @@ func (c *Connection) QueryContext(ctx context.Context, query string, args []driv return nil, driver.ErrSkip } + execCtx := ctx if _, set := ctx.Deadline(); !set && c.timeout > 0 { var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, c.timeout) + execCtx, cancel = context.WithTimeout(ctx, c.timeout) defer cancel() } - info, err := c.client.Execute(ctx, query) + info, err := c.client.Execute(execCtx, query) if err != nil { return nil, err } diff --git a/go/arrow/flight/flightsql/driver/driver_test.go b/go/arrow/flight/flightsql/driver/driver_test.go index e5060ccbe33..c00dfe3c5d9 100644 --- a/go/arrow/flight/flightsql/driver/driver_test.go +++ b/go/arrow/flight/flightsql/driver/driver_test.go @@ -626,7 +626,6 @@ func (s *SqlTestSuite) TestRowsPrematureCloseDuringNextLoop() { rows, err := db.QueryContext(context.TODO(), sqlSelectAll) require.NoError(t, err) require.NotNil(t, rows) - 
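// Illustrative sketch only (not part of this diff): a minimal, self-contained
// version of the pattern the driver fix above applies. The statement timeout
// is attached to a derived execCtx that is used only for executing the query,
// while the caller's ctx stays free of that deadline so the returned rows can
// still be initialized and read afterwards. The names sketch, runWithTimeout
// and exec are hypothetical and not part of the Flight SQL driver API.
package sketch

import (
	"context"
	"time"
)

func runWithTimeout(ctx context.Context, timeout time.Duration, exec func(context.Context) error) error {
	execCtx := ctx
	if _, set := ctx.Deadline(); !set && timeout > 0 {
		var cancel context.CancelFunc
		execCtx, cancel = context.WithTimeout(ctx, timeout)
		defer cancel()
	}
	// Only the execution call observes the deadline; ctx remains usable later.
	return exec(execCtx)
}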
require.NoError(t, rows.Err()) const closeAfterNRows = 10 var ( @@ -645,6 +644,7 @@ func (s *SqlTestSuite) TestRowsPrematureCloseDuringNextLoop() { require.NoError(t, rows.Close()) } } + require.NoError(t, rows.Err()) require.Equal(t, closeAfterNRows, i) From cb645a1b27dd66fddb88458c939e2851f9dadf35 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 24 Aug 2024 06:08:18 +0900 Subject: [PATCH 066/157] GH-43802: [GLib] Add `GAFlightRecordBatchWriter` (#43803) ### Rationale for this change This is needed to implement `DoPut`. ### What changes are included in this PR? We can't add tests for it because it's an abstract class. I'm not sure `is_owner` is needed like `GAFlightRecordBatchReader`. `is_owner` may be removed later if we find that it's needless. ### Are these changes tested? No. ### Are there any user-facing changes? Yes. `GAFlightRecordBatchWriter` is a new public API. * GitHub Issue: #43802 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-flight-glib/common.cpp | 198 ++++++++++++++++++++++++++-- c_glib/arrow-flight-glib/common.h | 32 +++++ c_glib/arrow-flight-glib/common.hpp | 4 + 3 files changed, 224 insertions(+), 10 deletions(-) diff --git a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp index efc544f10cf..f7eea08c264 100644 --- a/c_glib/arrow-flight-glib/common.cpp +++ b/c_glib/arrow-flight-glib/common.cpp @@ -48,7 +48,11 @@ G_BEGIN_DECLS * * #GAFlightStreamChunk is a class for a chunk in stream. * - * #GAFlightRecordBatchReader is a class for reading record batches. + * #GAFlightRecordBatchReader is an abstract class for reading record + * batches with metadata. + * + * #GAFlightRecordBatchWeriter is an abstract class for + * writing record batches with metadata. * * Since: 5.0.0 */ @@ -1172,13 +1176,13 @@ typedef struct GAFlightRecordBatchReaderPrivate_ } GAFlightRecordBatchReaderPrivate; enum { - PROP_READER = 1, - PROP_IS_OWNER, + PROP_RECORD_BATCH_READER_READER = 1, + PROP_RECORD_BATCH_READER_IS_OWNER, }; -G_DEFINE_TYPE_WITH_PRIVATE(GAFlightRecordBatchReader, - gaflight_record_batch_reader, - G_TYPE_OBJECT) +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchReader, + gaflight_record_batch_reader, + G_TYPE_OBJECT) #define GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(obj) \ static_cast( \ @@ -1204,11 +1208,11 @@ gaflight_record_batch_reader_set_property(GObject *object, auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(object); switch (prop_id) { - case PROP_READER: + case PROP_RECORD_BATCH_READER_READER: priv->reader = static_cast(g_value_get_pointer(value)); break; - case PROP_IS_OWNER: + case PROP_RECORD_BATCH_READER_IS_OWNER: priv->is_owner = g_value_get_boolean(value); break; default: @@ -1236,7 +1240,7 @@ gaflight_record_batch_reader_class_init(GAFlightRecordBatchReaderClass *klass) nullptr, nullptr, static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_READER, spec); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_READER_READER, spec); spec = g_param_spec_boolean( "is-owner", @@ -1244,7 +1248,7 @@ gaflight_record_batch_reader_class_init(GAFlightRecordBatchReaderClass *klass) nullptr, TRUE, static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_IS_OWNER, spec); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_READER_IS_OWNER, spec); } /** @@ -1296,6 +1300,173 @@ gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError } } +typedef struct 
GAFlightRecordBatchWriterPrivate_ +{ + arrow::flight::MetadataRecordBatchWriter *writer; + bool is_owner; +} GAFlightRecordBatchWriterPrivate; + +enum { + PROP_RECORD_BATCH_WRITER_WRITER = 1, + PROP_RECORD_BATCH_WRITER_IS_OWNER, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GARROW_TYPE_RECORD_BATCH_WRITER) + +#define GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_record_batch_writer_get_instance_private( \ + GAFLIGHT_RECORD_BATCH_WRITER(object))) + +static void +gaflight_record_batch_writer_finalize(GObject *object) +{ + auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); + if (priv->is_owner) { + delete priv->writer; + } + G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); +} + +static void +gaflight_record_batch_writer_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_RECORD_BATCH_WRITER_WRITER: + priv->writer = + static_cast(g_value_get_pointer(value)); + break; + case PROP_RECORD_BATCH_WRITER_IS_OWNER: + priv->is_owner = g_value_get_boolean(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) +{ +} + +static void +gaflight_record_batch_writer_class_init(GAFlightRecordBatchWriterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_record_batch_writer_finalize; + gobject_class->set_property = gaflight_record_batch_writer_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "writer", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_WRITER, spec); + + spec = g_param_spec_boolean( + "is-owner", + nullptr, + nullptr, + TRUE, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_IS_OWNER, spec); +} + +/** + * gaflight_record_batch_writer_begin: + * @writer: A #GAFlightRecordBatchWriter. + * @schema: A #GArrowSchema. + * @options: (nullable): A #GArrowWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Begins writing data with the given schema. Only used with + * `DoExchange`. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, + GArrowSchema *schema, + GArrowWriteOptions *options, + GError **error) +{ + auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto arrow_schema = garrow_schema_get_raw(schema); + arrow::ipc::IpcWriteOptions arrow_write_options; + if (options) { + arrow_write_options = *garrow_write_options_get_raw(options); + } else { + arrow_write_options = arrow::ipc::IpcWriteOptions::Defaults(); + } + return garrow::check(error, + flight_writer->Begin(arrow_schema, arrow_write_options), + "[flight-record-batch-writer][begin]"); +} + +/** + * gaflight_record_batch_writer_write_metadata: + * @writer: A #GAFlightRecordBatchWriter. + * @metadata: A #GArrowBuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Write metadata. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto arrow_metadata = garrow_buffer_get_raw(metadata); + return garrow::check(error, + flight_writer->WriteMetadata(arrow_metadata), + "[flight-record-batch-writer][write-metadata]"); +} + +/** + * gaflight_record_batch_writer_write: + * @writer: A #GAFlightRecordBatchWriter. + * @record_batch: A #GArrowRecordBatch. + * @metadata: (nullable): A #GArrowBuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Write a record batch with metadata. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto arrow_metadata = garrow_buffer_get_raw(metadata); + return garrow::check( + error, + flight_writer->WriteWithMetadata(*arrow_record_batch, arrow_metadata), + "[flight-record-batch-writer][write]"); +} + G_END_DECLS GAFlightCriteria * @@ -1428,3 +1599,10 @@ gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader) auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(reader); return priv->reader; } + +arrow::flight::MetadataRecordBatchWriter * +gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer) +{ + auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(writer); + return priv->writer; +} diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h index b1d89f79c35..91c828caabb 100644 --- a/c_glib/arrow-flight-glib/common.h +++ b/c_glib/arrow-flight-glib/common.h @@ -232,4 +232,36 @@ GAFLIGHT_AVAILABLE_IN_6_0 GArrowTable * gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError **error); +#define GAFLIGHT_TYPE_RECORD_BATCH_WRITER (gaflight_record_batch_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GAFLIGHT, + RECORD_BATCH_WRITER, + GArrowRecordBatchWriter) +struct _GAFlightRecordBatchWriterClass +{ + GArrowRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, + GArrowSchema *schema, + GArrowWriteOptions *options, + GError **error); + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, + GArrowBuffer *metadata, + GError **error); + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/common.hpp b/c_glib/arrow-flight-glib/common.hpp index db56fff579b..ae5a7703397 100644 --- a/c_glib/arrow-flight-glib/common.hpp +++ b/c_glib/arrow-flight-glib/common.hpp @@ -79,3 +79,7 @@ gaflight_stream_chunk_get_raw(GAFlightStreamChunk *chunk); GAFLIGHT_EXTERN arrow::flight::MetadataRecordBatchReader * gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader); + +GAFLIGHT_EXTERN +arrow::flight::MetadataRecordBatchWriter * +gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer); From 
146b4e9669071984c883ec5791676638014bd655 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 24 Aug 2024 06:22:26 +0900 Subject: [PATCH 067/157] GH-43743: [CI][Docs] Ensure creating build directory (#43744) ### Rationale for this change It's used as a volume. If it doesn't exist, `docker compose` reports an error: Error response from daemon: invalid mount config for type "bind": bind source path does not exist: /home/runner/work/crossbow/crossbow/build/ ### What changes are included in this PR? * Create build directory * Move required `-v $PWD/build/:/build/` to `docs/github.linux.yml` ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #43743 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/docs/github.linux.yml | 4 +++- dev/tasks/tasks.yml | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/tasks/docs/github.linux.yml b/dev/tasks/docs/github.linux.yml index 8ab8a593c3e..5863d68d2c8 100644 --- a/dev/tasks/docs/github.linux.yml +++ b/dev/tasks/docs/github.linux.yml @@ -34,8 +34,10 @@ jobs: env: ARROW_JAVA_SKIP_GIT_PLUGIN: true run: | + mkdir -p build archery docker run \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ + -v $PWD/build/:/build/ \ {{ flags|default("") }} \ {{ image }} \ {{ command|default("") }} @@ -45,7 +47,7 @@ jobs: ref: {{ default_branch|default("main") }} path: crossbow fetch-depth: 1 - {% if publish %} + {% if publish %} - name: Prepare Docs Preview run: | # build files are created by the docker user diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 60114d69308..cae34c32313 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1487,7 +1487,7 @@ tasks: image: debian-go {% endfor %} - # be sure to update binary-task.rb when upgrading ubuntu + # be sure to update binary-task.rb when upgrading Debian test-debian-12-docs: ci: github template: docs/github.linux.yml @@ -1495,7 +1495,6 @@ tasks: env: JDK: 17 pr_number: Unset - flags: "-v $PWD/build/:/build/" image: debian-docs publish: false artifacts: @@ -1621,6 +1620,5 @@ tasks: env: JDK: 17 pr_number: Unset - flags: "-v $PWD/build/:/build/" image: debian-docs publish: true From e61c105c73dfabb51d5afc972ff21cc5326b3d93 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Sat, 24 Aug 2024 07:07:09 +0530 Subject: [PATCH 068/157] GH-41584: [Java] ListView Implementation for C Data Interface (#43686) ### Rationale for this change C Data Interface is missing `ListView` and `LargeListView` after recently merging core functionalities. Also closes; - [x] https://github.com/apache/arrow/issues/41585 ### What changes are included in this PR? This PR includes C Data interface related component additions to `ListView` and `LargeListView` along with the corresponding test cases. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41584 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- dev/archery/archery/integration/datagen.py | 1 - .../arrow/c/BufferImportTypeVisitor.java | 14 +- .../main/java/org/apache/arrow/c/Format.java | 8 ++ .../org/apache/arrow/c/RoundtripTest.java | 42 ++++++ java/c/src/test/python/integration_tests.py | 47 ++++++ .../BaseLargeRepeatedValueViewVector.java | 29 ++-- .../complex/BaseRepeatedValueViewVector.java | 30 ++-- .../vector/complex/LargeListViewVector.java | 10 +- .../arrow/vector/complex/ListViewVector.java | 6 +- .../arrow/vector/TestLargeListViewVector.java | 134 ++++++++++++++++++ .../arrow/vector/TestListViewVector.java | 132 +++++++++++++++++ .../testing/ValueVectorDataPopulator.java | 34 +++++ 12 files changed, 451 insertions(+), 36 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 47310c905a9..d395d26cb71 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1936,7 +1936,6 @@ def _temp_path(): generate_list_view_case() .skip_tester('C#') # Doesn't support large list views - .skip_tester('Java') .skip_tester('JS') .skip_tester('nanoarrow') .skip_tester('Rust'), diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 633ecd43bd5..93fef6d7ca8 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -47,7 +47,9 @@ import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -400,13 +402,17 @@ public List visit(ArrowType.Duration type) { @Override public List visit(ArrowType.ListView type) { - throw new UnsupportedOperationException( - "Importing buffers for view type: " + type + " not supported"); + return Arrays.asList( + maybeImportBitmap(type), + importFixedBytes(type, 1, ListViewVector.OFFSET_WIDTH), + importFixedBytes(type, 2, ListViewVector.SIZE_WIDTH)); } @Override public List visit(ArrowType.LargeListView type) { - throw new UnsupportedOperationException( - "Importing buffers for view type: " + type + " not supported"); + return Arrays.asList( + maybeImportBitmap(type), + importFixedBytes(type, 1, LargeListViewVector.OFFSET_WIDTH), + importFixedBytes(type, 2, LargeListViewVector.SIZE_WIDTH)); } } diff --git a/java/c/src/main/java/org/apache/arrow/c/Format.java b/java/c/src/main/java/org/apache/arrow/c/Format.java index aff51e7b734..f77a555d184 100644 --- a/java/c/src/main/java/org/apache/arrow/c/Format.java +++ b/java/c/src/main/java/org/apache/arrow/c/Format.java @@ -229,6 +229,10 @@ static String asString(ArrowType arrowType) { return "vu"; case BinaryView: return "vz"; + case ListView: + return "+vl"; + case LargeListView: + return "+vL"; case NONE: throw new IllegalArgumentException("Arrow type ID is NONE"); default: @@ -313,6 +317,10 @@ static ArrowType asType(String format, long flags) return new ArrowType.Utf8View(); case "vz": return new ArrowType.BinaryView(); + case "+vl": + return new 
ArrowType.ListView(); + case "+vL": + return new ArrowType.LargeListView(); default: String[] parts = format.split(":", 2); if (parts.length == 2) { diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 6591d1f7309..18b2e94adde 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -84,7 +84,9 @@ import org.apache.arrow.vector.compare.VectorEqualsVisitor; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -683,6 +685,46 @@ public void testFixedSizeListVector() { } } + @Test + public void testListViewVector() { + try (final ListViewVector vector = ListViewVector.empty("v", allocator)) { + setVector( + vector, + Arrays.stream(new int[] {1, 2}).boxed().collect(Collectors.toList()), + Arrays.stream(new int[] {3, 4}).boxed().collect(Collectors.toList()), + new ArrayList()); + assertTrue(roundtrip(vector, ListViewVector.class)); + } + } + + @Test + public void testEmptyListViewVector() { + try (final ListViewVector vector = ListViewVector.empty("v", allocator)) { + setVector(vector, new ArrayList()); + assertTrue(roundtrip(vector, ListViewVector.class)); + } + } + + @Test + public void testLargeListViewVector() { + try (final LargeListViewVector vector = LargeListViewVector.empty("v", allocator)) { + setVector( + vector, + Arrays.stream(new int[] {1, 2}).boxed().collect(Collectors.toList()), + Arrays.stream(new int[] {3, 4}).boxed().collect(Collectors.toList()), + new ArrayList()); + assertTrue(roundtrip(vector, LargeListViewVector.class)); + } + } + + @Test + public void testEmptyLargeListViewVector() { + try (final LargeListViewVector vector = LargeListViewVector.empty("v", allocator)) { + setVector(vector, new ArrayList()); + assertTrue(roundtrip(vector, LargeListViewVector.class)); + } + } + @Test public void testMapVector() { int count = 5; diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index ab2ee1742f3..b0a86e9c66e 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -352,6 +352,53 @@ def test_reader_complex_roundtrip(self): ] self.round_trip_reader(schema, data) + def test_listview_array(self): + self.round_trip_array(lambda: pa.array( + [[], [0], [1, 2], [4, 5, 6]], pa.list_view(pa.int64()) + # disabled check_metadata since in Java API the listview + # internal field name ("item") is not preserved + # during round trips (it becomes "$data$"). 
+ ), check_metadata=False) + + def test_empty_listview_array(self): + with pa.BufferOutputStream() as bos: + schema = pa.schema([pa.field("f0", pa.list_view(pa.int32()), True)]) + with ipc.new_stream(bos, schema) as writer: + src = pa.RecordBatch.from_arrays( + [pa.array([[]], pa.list_view(pa.int32()))], schema=schema) + writer.write(src) + data_bytes = bos.getvalue() + + def recreate_batch(): + with pa.input_stream(data_bytes) as ios: + with ipc.open_stream(ios) as reader: + return reader.read_next_batch() + + self.round_trip_record_batch(recreate_batch) + + def test_largelistview_array(self): + self.round_trip_array(lambda: pa.array( + [[], [0], [1, 2], [4, 5, 6]], pa.large_list_view(pa.int64()) + # disabled check_metadata since in Java API the listview + # internal field name ("item") is not preserved + # during round trips (it becomes "$data$"). + ), check_metadata=False) + + def test_empty_largelistview_array(self): + with pa.BufferOutputStream() as bos: + schema = pa.schema([pa.field("f0", pa.large_list_view(pa.int32()), True)]) + with ipc.new_stream(bos, schema) as writer: + src = pa.RecordBatch.from_arrays( + [pa.array([[]], pa.large_list_view(pa.int32()))], schema=schema) + writer.write(src) + data_bytes = bos.getvalue() + + def recreate_batch(): + with pa.input_stream(data_bytes) as ios: + with ipc.open_stream(ios) as reader: + return reader.read_next_batch() + + self.round_trip_record_batch(recreate_batch) if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java index f643306cfdc..12edd6557bd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java @@ -305,38 +305,43 @@ public void setValueCount(int valueCount) { while (valueCount > getOffsetBufferValueCapacity()) { reallocateBuffers(); } - final int childValueCount = valueCount == 0 ? 0 : getLengthOfChildVector(); + final int childValueCount = valueCount == 0 ? 0 : getMaxViewEndChildVector(); vector.setValueCount(childValueCount); } - protected int getLengthOfChildVector() { + /** + * Get the end of the child vector via the maximum view length. This method deduces the length by + * considering the condition i.e., argmax_i(offsets[i] + size[i]). + * + * @return the end of the child vector. + */ + protected int getMaxViewEndChildVector() { int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); - int minOffset = offsetBuffer.getInt(0); for (int i = 0; i < valueCount; i++) { int currentOffset = offsetBuffer.getInt((long) i * OFFSET_WIDTH); int currentSize = sizeBuffer.getInt((long) i * SIZE_WIDTH); int currentSum = currentOffset + currentSize; - maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); - minOffset = Math.min(minOffset, currentOffset); } - return maxOffsetSizeSum - minOffset; + return maxOffsetSizeSum; } - protected int getLengthOfChildVectorByIndex(int index) { + /** + * Get the end of the child vector via the maximum view length of the child vector by index. 
+ * + * @return the end of the child vector by index + */ + protected int getMaxViewEndChildVectorByIndex(int index) { int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); - int minOffset = offsetBuffer.getInt(0); for (int i = 0; i < index; i++) { int currentOffset = offsetBuffer.getInt((long) i * OFFSET_WIDTH); int currentSize = sizeBuffer.getInt((long) i * SIZE_WIDTH); int currentSum = currentOffset + currentSize; - maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); - minOffset = Math.min(minOffset, currentOffset); } - return maxOffsetSizeSum - minOffset; + return maxOffsetSizeSum; } /** @@ -390,7 +395,7 @@ public int startNewValue(int index) { } if (index > 0) { - final int prevOffset = getLengthOfChildVectorByIndex(index); + final int prevOffset = getMaxViewEndChildVectorByIndex(index); offsetBuffer.setInt((long) index * OFFSET_WIDTH, prevOffset); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java index 031cc8037bb..e6213316b55 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java @@ -304,38 +304,44 @@ public void setValueCount(int valueCount) { while (valueCount > getOffsetBufferValueCapacity()) { reallocateBuffers(); } - final int childValueCount = valueCount == 0 ? 0 : getLengthOfChildVector(); + final int childValueCount = valueCount == 0 ? 0 : getMaxViewEndChildVector(); vector.setValueCount(childValueCount); } - protected int getLengthOfChildVector() { + /** + * Get the end of the child vector via the maximum view length. This method deduces the length by + * considering the condition i.e., argmax_i(offsets[i] + size[i]). + * + * @return the end of the child vector. + */ + protected int getMaxViewEndChildVector() { int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); - int minOffset = offsetBuffer.getInt(0); for (int i = 0; i < valueCount; i++) { int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH); int currentSum = currentOffset + currentSize; - maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); - minOffset = Math.min(minOffset, currentOffset); } - return maxOffsetSizeSum - minOffset; + return maxOffsetSizeSum; } - protected int getLengthOfChildVectorByIndex(int index) { + /** + * Get the end of the child vector via the maximum view length of the child vector by index. 
+ * + * @return the end of the child vector by index + */ + protected int getMaxViewEndChildVectorByIndex(int index) { int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); - int minOffset = offsetBuffer.getInt(0); + // int minOffset = offsetBuffer.getInt(0); for (int i = 0; i < index; i++) { int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH); int currentSum = currentOffset + currentSize; - maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); - minOffset = Math.min(minOffset, currentOffset); } - return maxOffsetSizeSum - minOffset; + return maxOffsetSizeSum; } /** @@ -389,7 +395,7 @@ public int startNewValue(int index) { } if (index > 0) { - final int prevOffset = getLengthOfChildVectorByIndex(index); + final int prevOffset = getMaxViewEndChildVectorByIndex(index); offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java index 2c61f799a4c..84c6f03edb2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java @@ -250,7 +250,9 @@ public List getFieldBuffers() { */ @Override public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { - throw new UnsupportedOperationException("exportCDataBuffers Not implemented yet"); + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + exportBuffer(sizeBuffer, buffers, buffersPtr, nullValue, true); } @Override @@ -851,7 +853,7 @@ public int startNewValue(int index) { } if (index > 0) { - final int prevOffset = getLengthOfChildVectorByIndex(index); + final int prevOffset = getMaxViewEndChildVectorByIndex(index); offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset); } @@ -943,7 +945,7 @@ public void setValueCount(int valueCount) { } } /* valueCount for the data vector is the current end offset */ - final long childValueCount = (valueCount == 0) ? 0 : getLengthOfChildVector(); + final long childValueCount = (valueCount == 0) ? 0 : getMaxViewEndChildVector(); /* set the value count of data vector and this will take care of * checking whether data buffer needs to be reallocated. * TODO: revisit when 64-bit vectors are supported @@ -1001,7 +1003,7 @@ public double getDensity() { if (valueCount == 0) { return 0.0D; } - final double totalListSize = getLengthOfChildVector(); + final double totalListSize = getMaxViewEndChildVector(); return totalListSize / valueCount; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java index 7f6d92f3be9..9b4e6b4c0cd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java @@ -858,7 +858,7 @@ public int startNewValue(int index) { } if (index > 0) { - final int prevOffset = getLengthOfChildVectorByIndex(index); + final int prevOffset = getMaxViewEndChildVectorByIndex(index); offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset); } @@ -942,7 +942,7 @@ public void setValueCount(int valueCount) { } } /* valueCount for the data vector is the current end offset */ - final int childValueCount = (valueCount == 0) ? 
0 : getLengthOfChildVector(); + final int childValueCount = (valueCount == 0) ? 0 : getMaxViewEndChildVector(); /* set the value count of data vector and this will take care of * checking whether data buffer needs to be reallocated. */ @@ -1005,7 +1005,7 @@ public double getDensity() { if (valueCount == 0) { return 0.0D; } - final double totalListSize = getLengthOfChildVector(); + final double totalListSize = getMaxViewEndChildVector(); return totalListSize / valueCount; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java index 2ed8d4d7005..26e7bb4a0d3 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java @@ -2095,6 +2095,140 @@ public void testOutOfOrderOffsetSplitAndTransfer() { } } + @Test + public void testRangeChildVector1() { + /* + * Non-overlapping ranges + * offsets: [0, 2] + * sizes: [4, 1] + * values: [0, 1, 2, 3] + * + * vector: [[0, 1, 2, 3], [2]] + * */ + try (LargeListViewVector largeListViewVector = + LargeListViewVector.empty("largelistview", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + largeListViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + + FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null); + Field field = new Field("child-vector", fieldType, null); + largeListViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = largeListViewVector.getDataVector(); + fieldVector.clear(); + + IntVector childVector = (IntVector) fieldVector; + + childVector.allocateNew(8); + + childVector.set(0, 0); + childVector.set(1, 1); + childVector.set(2, 2); + childVector.set(3, 3); + childVector.set(4, 4); + childVector.set(5, 5); + childVector.set(6, 6); + childVector.set(7, 7); + + childVector.setValueCount(8); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + largeListViewVector.setValidity(0, 1); + largeListViewVector.setValidity(1, 1); + + largeListViewVector.setOffset(0, 0); + largeListViewVector.setOffset(1, 2); + + largeListViewVector.setSize(0, 4); + largeListViewVector.setSize(1, 1); + + assertEquals(8, largeListViewVector.getDataVector().getValueCount()); + + largeListViewVector.setValueCount(2); + assertEquals(4, largeListViewVector.getDataVector().getValueCount()); + + IntVector childVector1 = (IntVector) largeListViewVector.getDataVector(); + final ArrowBuf dataBuffer = childVector1.getDataBuffer(); + final ArrowBuf validityBuffer = childVector1.getValidityBuffer(); + + // yet the underneath buffer contains the original buffer + for (int i = 0; i < validityBuffer.capacity(); i++) { + assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH)); + } + } + } + + @Test + public void testRangeChildVector2() { + /* + * Overlapping ranges + * offsets: [0, 2] + * sizes: [3, 1] + * values: [0, 1, 2, 3] + * + * vector: [[1, 2, 3], [2]] + * */ + try (LargeListViewVector largeListViewVector = + LargeListViewVector.empty("largelistview", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + largeListViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. 
+ + FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null); + Field field = new Field("child-vector", fieldType, null); + largeListViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = largeListViewVector.getDataVector(); + fieldVector.clear(); + + IntVector childVector = (IntVector) fieldVector; + + childVector.allocateNew(8); + + childVector.set(0, 0); + childVector.set(1, 1); + childVector.set(2, 2); + childVector.set(3, 3); + childVector.set(4, 4); + childVector.set(5, 5); + childVector.set(6, 6); + childVector.set(7, 7); + + childVector.setValueCount(8); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + largeListViewVector.setValidity(0, 1); + largeListViewVector.setValidity(1, 1); + + largeListViewVector.setOffset(0, 1); + largeListViewVector.setOffset(1, 2); + + largeListViewVector.setSize(0, 3); + largeListViewVector.setSize(1, 1); + + assertEquals(8, largeListViewVector.getDataVector().getValueCount()); + + largeListViewVector.setValueCount(2); + assertEquals(4, largeListViewVector.getDataVector().getValueCount()); + + IntVector childVector1 = (IntVector) largeListViewVector.getDataVector(); + final ArrowBuf dataBuffer = childVector1.getDataBuffer(); + final ArrowBuf validityBuffer = childVector1.getValidityBuffer(); + + // yet the underneath buffer contains the original buffer + for (int i = 0; i < validityBuffer.capacity(); i++) { + assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH)); + } + } + } + private void writeIntValues(UnionLargeListViewWriter writer, int[] values) { writer.startListView(); for (int v : values) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java index 4fa808c18ae..639585fc48d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java @@ -2084,6 +2084,138 @@ public void testOutOfOrderOffsetSplitAndTransfer() { } } + @Test + public void testRangeChildVector1() { + /* + * Non-overlapping ranges + * offsets: [0, 2] + * sizes: [4, 1] + * values: [0, 1, 2, 3] + * + * vector: [[0, 1, 2, 3], [2]] + * */ + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + + FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + IntVector childVector = (IntVector) fieldVector; + + childVector.allocateNew(8); + + childVector.set(0, 0); + childVector.set(1, 1); + childVector.set(2, 2); + childVector.set(3, 3); + childVector.set(4, 4); + childVector.set(5, 5); + childVector.set(6, 6); + childVector.set(7, 7); + + childVector.setValueCount(8); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. 
+ listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 1); + + listViewVector.setOffset(0, 0); + listViewVector.setOffset(1, 2); + + listViewVector.setSize(0, 4); + listViewVector.setSize(1, 1); + + assertEquals(8, listViewVector.getDataVector().getValueCount()); + + listViewVector.setValueCount(2); + assertEquals(4, listViewVector.getDataVector().getValueCount()); + + IntVector childVector1 = (IntVector) listViewVector.getDataVector(); + final ArrowBuf dataBuffer = childVector1.getDataBuffer(); + final ArrowBuf validityBuffer = childVector1.getValidityBuffer(); + + // yet the underneath buffer contains the original buffer + for (int i = 0; i < validityBuffer.capacity(); i++) { + assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH)); + } + } + } + + @Test + public void testRangeChildVector2() { + /* + * Overlapping ranges + * offsets: [0, 2] + * sizes: [3, 1] + * values: [0, 1, 2, 3] + * + * vector: [[1, 2, 3], [2]] + * */ + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + + FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + IntVector childVector = (IntVector) fieldVector; + + childVector.allocateNew(8); + + childVector.set(0, 0); + childVector.set(1, 1); + childVector.set(2, 2); + childVector.set(3, 3); + childVector.set(4, 4); + childVector.set(5, 5); + childVector.set(6, 6); + childVector.set(7, 7); + + childVector.setValueCount(8); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. 
+ listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 1); + + listViewVector.setOffset(0, 1); + listViewVector.setOffset(1, 2); + + listViewVector.setSize(0, 3); + listViewVector.setSize(1, 1); + + assertEquals(8, listViewVector.getDataVector().getValueCount()); + + listViewVector.setValueCount(2); + assertEquals(4, listViewVector.getDataVector().getValueCount()); + + IntVector childVector1 = (IntVector) listViewVector.getDataVector(); + final ArrowBuf dataBuffer = childVector1.getDataBuffer(); + final ArrowBuf validityBuffer = childVector1.getValidityBuffer(); + + // yet the underneath buffer contains the original buffer + for (int i = 0; i < validityBuffer.capacity(); i++) { + assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH)); + } + } + } + private void writeIntValues(UnionListViewWriter writer, int[] values) { writer.startListView(); for (int v : values) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index 69e16dc4703..afbc30f019e 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -60,10 +60,12 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VariableWidthFieldVector; +import org.apache.arrow.vector.complex.BaseLargeRepeatedValueViewVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; @@ -760,4 +762,36 @@ public static void setVector(ListViewVector vector, List... values) { dataVector.setValueCount(curPos); vector.setValueCount(values.length); } + + /** Populate values for {@link ListViewVector}. */ + public static void setVector(LargeListViewVector vector, List... values) { + vector.allocateNewSafe(); + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + for (int i = 0; i < values.length; i++) { + vector + .getOffsetBuffer() + .setInt((long) i * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH, curPos); + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.setSafe(curPos, value); + curPos += 1; + } + } + vector + .getSizeBuffer() + .setInt((long) i * BaseRepeatedValueViewVector.SIZE_WIDTH, values[i].size()); + } + dataVector.setValueCount(curPos); + vector.setValueCount(values.length); + } } From 83d915a3d2ac2acecbb2cb2dc0dd7f5a213dd625 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 12:38:38 +0900 Subject: [PATCH 069/157] MINOR: [Java] Bump dep.slf4j.version from 2.0.13 to 2.0.16 in /java (#43652) Bumps `dep.slf4j.version` from 2.0.13 to 2.0.16. 
Updates `org.slf4j:slf4j-api` from 2.0.13 to 2.0.16 Updates `org.slf4j:slf4j-jdk14` from 2.0.13 to 2.0.16 Updates `org.slf4j:jul-to-slf4j` from 2.0.13 to 2.0.16 Updates `org.slf4j:jcl-over-slf4j` from 2.0.13 to 2.0.16 Updates `org.slf4j:log4j-over-slf4j` from 2.0.13 to 2.0.16 Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index a73453df68f..54bb7a0ae0e 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -94,7 +94,7 @@ under the License. ${project.build.directory}/generated-sources 1.9.0 5.10.3 - 2.0.13 + 2.0.16 33.2.1-jre 4.1.112.Final 1.66.0 From cbb5f96306972aa236750602aba4b40ceb4219c4 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Sun, 25 Aug 2024 21:33:51 -0700 Subject: [PATCH 070/157] MINOR: [R] Add missing PR num to news.md item (#43811) ### Rationale for this change We normally link to somewhere to give the user more context on news items. I noticed the link was missing for this one. ### What changes are included in this PR? Added PR number to news item. ### Are these changes tested? No. ### Are there any user-facing changes? No. Authored-by: Bryce Mecum Signed-off-by: Jacob Wujciak-Jens --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 0e6e4634a0a..b9568afe665 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -32,7 +32,7 @@ functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `mutate()` expressions can now include aggregations, such as `x - mean(x)`. (#41350) * `summarize()` supports more complex expressions, and correctly handles cases - where column names are reused in expressions. + where column names are reused in expressions. (#41223) * The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. (#41358) * R metadata, stored in the Arrow schema to support round-tripping data between From 51e9f70f94cd09a0a08196afdd2f4fc644666b5e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:20:20 +0900 Subject: [PATCH 071/157] MINOR: [Java] Bump dep.junit.jupiter.version from 5.10.3 to 5.11.0 in /java (#43751) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps `dep.junit.jupiter.version` from 5.10.3 to 5.11.0. Updates `org.junit.jupiter:junit-jupiter-engine` from 5.10.3 to 5.11.0
Release notes

Sourced from org.junit.jupiter:junit-jupiter-engine's releases.

JUnit 5.11.0 = Platform 1.11.0 + Jupiter 5.11.0 + Vintage 5.11.0

See Release Notes.

New Contributors

Full Changelog: https://github.com/junit-team/junit5/compare/r5.10.3...r5.11.0

JUnit 5.11.0-RC1 = Platform 1.11.0-RC1 + Jupiter 5.11.0-RC1 + Vintage 5.11.0-RC1

See Release Notes.

New Contributors

Full Changelog: https://github.com/junit-team/junit5/compare/r5.11.0-M2...r5.11.0-RC1

JUnit 5.11.0-M2 = Platform 1.11.0-M2 + Jupiter 5.11.0-M2 + Vintage 5.11.0-M2

See Release Notes.

New Contributors

Full Changelog: https://github.com/junit-team/junit5/compare/r5.11.0-M1...r5.11.0-M2

JUnit 5.11.0-M1 = Platform 1.11.0-M1 + Jupiter 5.11.0-M1 + Vintage 5.11.0-M1

... (truncated)

Commits
  • 6b8e42b Release 5.11
  • 9430ece Allow potentially unlimited maxCharsPerColumn in Csv{File}Source (#3924)
  • 0b10f86 Polish release notes
  • 4dbd0f9 Let @ TempDir fail fast with File annotated element and non-default file s...
  • 57f1ad4 Fix syntax
  • d78730a Prioritize tasks on critical path of task graph
  • b6719e2 Remove obsolete directory
  • d8ec757 Apply Spotless formatting to Gradle script plugins
  • dae525d Disable caching of some Spotless tasks due to negative avoidance savings
  • c63d118 Re-enable caching verifyOSGi tasks (issue was fixed in bnd 7.0.0)
  • Additional commits viewable in compare view

Updates `org.junit.jupiter:junit-jupiter-api` from 5.10.3 to 5.11.0

Updates `org.junit.jupiter:junit-jupiter-params` from 5.10.3 to 5.11.0

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 54bb7a0ae0e..77feed12f3f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -93,7 +93,7 @@ under the License. ${project.build.directory}/generated-sources 1.9.0 - 5.10.3 + 5.11.0 2.0.16 33.2.1-jre 4.1.112.Final From 2328b6ee39b497d9f48e6d342db9f7d0c34d9791 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 26 Aug 2024 16:34:18 +0200 Subject: [PATCH 072/157] GH-15058: [C++][Python] Native support for UUID (#37298) ### Rationale for this change See #15058. UUID datatype is common in throughout the ecosystem and Arrow as supporting it as a native type would reduce friction. ### What changes are included in this PR? This PR implements logic for Arrow canonical extension type in C++ and a Python wrapper. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes, new extension type is added. * Closes: #15058 Authored-by: Rok Mihevc Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 3 +- cpp/src/arrow/acero/hash_join_node_test.cc | 1 + cpp/src/arrow/extension/CMakeLists.txt | 2 +- .../extension/fixed_shape_tensor_test.cc | 17 +-- cpp/src/arrow/extension/uuid.cc | 58 ++++++++++ cpp/src/arrow/extension/uuid.h | 61 ++++++++++ cpp/src/arrow/extension/uuid_test.cc | 72 ++++++++++++ cpp/src/arrow/extension_type.cc | 4 +- cpp/src/arrow/extension_type_test.cc | 19 +--- .../integration/json_integration_test.cc | 2 +- cpp/src/arrow/ipc/test_common.cc | 35 ++++-- cpp/src/arrow/ipc/test_common.h | 3 + cpp/src/arrow/scalar_test.cc | 5 +- cpp/src/arrow/testing/extension_type.h | 6 +- cpp/src/arrow/testing/gtest_util.cc | 16 ++- dev/archery/archery/integration/datagen.py | 2 +- docs/source/format/CanonicalExtensions.rst | 2 + docs/source/status.rst | 2 +- python/pyarrow/__init__.py | 18 +-- python/pyarrow/array.pxi | 6 + python/pyarrow/includes/libarrow.pxd | 10 ++ python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 11 +- python/pyarrow/scalar.pxi | 10 ++ python/pyarrow/src/arrow/python/gdb.cc | 27 +---- python/pyarrow/tests/extensions.pyx | 2 +- python/pyarrow/tests/test_extension_type.py | 105 ++++++++++++------ python/pyarrow/tests/test_gdb.py | 8 +- python/pyarrow/types.pxi | 34 ++++++ 29 files changed, 412 insertions(+), 132 deletions(-) create mode 100644 cpp/src/arrow/extension/uuid.cc create mode 100644 cpp/src/arrow/extension/uuid.h create mode 100644 cpp/src/arrow/extension/uuid_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 89f28ee416e..6b0ac8c23c7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -375,6 +375,7 @@ set(ARROW_SRCS device.cc extension_type.cc extension/bool8.cc + extension/uuid.cc pretty_print.cc record_batch.cc result.cc @@ -1225,6 +1226,7 @@ add_subdirectory(testing) add_subdirectory(array) add_subdirectory(c) add_subdirectory(compute) +add_subdirectory(extension) add_subdirectory(io) add_subdirectory(tensor) add_subdirectory(util) @@ -1267,7 +1269,6 @@ endif() if(ARROW_JSON) add_subdirectory(json) - add_subdirectory(extension) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9065e286a22..76ad9c7d650 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -29,6 +29,7 @@ #include "arrow/compute/kernels/test_util.h" #include 
"arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 5cb4bc77af2..065ea3f1ddb 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -set(CANONICAL_EXTENSION_TESTS bool8_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 3fd39a11ff5..842a78e1a4f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -23,7 +23,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" +#include "arrow/ipc/test_common.h" #include "arrow/record_batch.h" #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" @@ -33,6 +33,7 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; +using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; @@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test { std::string serialized_; }; -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc new file mode 100644 index 00000000000..43b917a17f8 --- /dev/null +++ b/cpp/src/arrow/extension/uuid.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/extension_type.h" +#include "arrow/util/logging.h" + +#include "arrow/extension/uuid.h" + +namespace arrow::extension { + +bool UuidType::ExtensionEquals(const ExtensionType& other) const { + return (other.extension_name() == this->extension_name()); +} + +std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.uuid", + static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> UuidType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (!serialized.empty()) { + return Status::Invalid("Unexpected serialized metadata: '", serialized, "'"); + } + if (!storage_type->Equals(*fixed_size_binary(16))) { + return Status::Invalid("Invalid storage type for UuidType: ", + storage_type->ToString()); + } + return std::make_shared(); +} + +std::string UuidType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + +std::shared_ptr uuid() { return std::make_shared(); } + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h new file mode 100644 index 00000000000..42bb21cf0b2 --- /dev/null +++ b/cpp/src/arrow/extension/uuid.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief UuidArray stores array of UUIDs. Underlying storage type is +/// FixedSizeBinary(16). +class ARROW_EXPORT UuidArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief UuidType is a canonical arrow extension type for UUIDs. +/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this +/// does not interpret the bytes in any way. Specific UUID version is not +/// required or guaranteed. +class ARROW_EXPORT UuidType : public ExtensionType { + public: + /// \brief Construct a UuidType. + UuidType() : ExtensionType(fixed_size_binary(16)) {} + + std::string extension_name() const override { return "arrow.uuid"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + /// Create a UuidArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override { return ""; } + + /// \brief Create a UuidType instance + static Result> Make() { return std::make_shared(); } +}; + +/// \brief Return a UuidType instance. 
+ARROW_EXPORT std::shared_ptr uuid(); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc new file mode 100644 index 00000000000..3bbb6eeb4ae --- /dev/null +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/uuid.h" + +#include "arrow/testing/matchers.h" + +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/key_value_metadata.h" + +#include "arrow/testing/extension_type.h" + +namespace arrow { + +using arrow::ipc::test::RoundtripBatch; + +TEST(TestUuuidExtensionType, ExtensionTypeTest) { + auto type = uuid(); + ASSERT_EQ(type->id(), Type::EXTENSION); + + const auto& ext_type = static_cast(*type); + std::string serialized = ext_type.Serialize(); + + ASSERT_OK_AND_ASSIGN(auto deserialized, + ext_type.Deserialize(fixed_size_binary(16), serialized)); + ASSERT_TRUE(deserialized->Equals(*type)); + ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16))); +} + +TEST(TestUuuidExtensionType, RoundtripBatch) { + auto ext_type = extension::uuid(); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])"); + auto ext_arr = ExtensionType::WrapArray(ext_type, arr); + + // Pass extension array, expect getting back extension array + std::shared_ptr read_batch; + auto ext_field = field(/*name=*/"f0", /*type=*/ext_type); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + + // Pass extension metadata and storage array, expect getting back extension array + std::shared_ptr read_batch2; + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", ""}}); + ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(), + /*nullable=*/true, /*metadata=*/ext_metadata); + auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 83c7ebed4f3..fc220f73a6b 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -32,6 +32,7 @@ #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/opaque.h" #endif +#include "arrow/extension/uuid.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ 
-147,14 +148,13 @@ static void CreateGlobalRegistry() { // Register canonical extension types g_registry = std::make_shared(); - std::vector> ext_types{extension::bool8()}; + std::vector> ext_types{extension::bool8(), extension::uuid()}; #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); #endif - // Register canonical extension types for (const auto& ext_type : ext_types) { ARROW_CHECK_OK( g_registry->RegisterType(checked_pointer_cast(ext_type))); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index f104c984a64..f49ffc5cba5 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -30,6 +30,7 @@ #include "arrow/io/memory.h" #include "arrow/ipc/options.h" #include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -41,6 +42,8 @@ namespace arrow { +using arrow::ipc::test::RoundtripBatch; + class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType { class TestExtensionType : public ::testing::Test { public: - void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } + void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } void TearDown() { if (GetExtensionType("uuid")) { @@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { ASSERT_EQ(deserialized->byte_width(), 16); } -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, IpcRoundtrip) { auto ext_arr = ExampleUuid(); auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr}); diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index 9b56928c688..0e84ea6124d 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) { auto storage_array = ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])"); - AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array)); + AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array)); AssertArraysEqual(*batch->column(1), NullArray(2)); } diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 87c02e2d87a..fb4f6bd8ead 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -27,8 +27,10 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/ipc/test_common.h" +#include "arrow/ipc/writer.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -242,11 +244,11 @@ Status 
MakeRandomBooleanArray(const int length, bool include_nulls, std::shared_ptr* out) { std::vector values(length); random_null_bytes(length, 0.5, values.data()); - ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values)); + ARROW_ASSIGN_OR_RAISE(auto data, arrow::internal::BytesToBits(values)); if (include_nulls) { std::vector valid_bytes(length); - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(valid_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(valid_bytes)); random_null_bytes(length, 0.1, valid_bytes.data()); *out = std::make_shared(length, data, null_bitmap, -1); } else { @@ -596,7 +598,7 @@ Status MakeStruct(std::shared_ptr* out) { std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(null_bytes)); std::shared_ptr with_nulls( new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1)); @@ -1088,9 +1090,9 @@ Status MakeUuid(std::shared_ptr* out) { auto f1 = field("f1", uuid_type, /*nullable=*/false); auto schema = ::arrow::schema({f0, f1}); - auto a0 = std::make_shared( + auto a0 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["0123456789abcdef", null])")); - auto a1 = std::make_shared( + auto a1 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["ZYXWVUTSRQPONMLK", "JIHGFEDBA9876543"])")); @@ -1176,12 +1178,13 @@ enable_if_t::value, void> FillRandomData( Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed) { - const auto& element_type = internal::checked_cast(*type); + const auto& element_type = arrow::internal::checked_cast(*type); std::vector strides; if (row_major_p) { - RETURN_NOT_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK(arrow::internal::ComputeRowMajorStrides(element_type, shape, &strides)); } else { - RETURN_NOT_OK(internal::ComputeColumnMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK( + arrow::internal::ComputeColumnMajorStrides(element_type, shape, &strides)); } const int64_t element_size = element_type.bit_width() / CHAR_BIT; @@ -1233,6 +1236,20 @@ Status MakeRandomTensor(const std::shared_ptr& type, return Tensor::Make(type, buf, shape, strides).Value(out); } +void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +} + } // namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index db8613cbb1e..9b7e7f13e3a 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -184,6 +184,9 @@ Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed = 0); +ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + } // 
namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 104a5697b57..e9ec13e98b4 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -43,7 +43,6 @@ namespace arrow { using compute::Cast; using compute::CastOptions; - using internal::checked_cast; using internal::checked_pointer_cast; @@ -2038,7 +2037,7 @@ class TestExtensionScalar : public ::testing::Test { void SetUp() { type_ = uuid(); storage_type_ = fixed_size_binary(16); - uuid_type_ = checked_cast(type_.get()); + uuid_type_ = checked_cast(type_.get()); } protected: @@ -2049,7 +2048,7 @@ class TestExtensionScalar : public ::testing::Test { } std::shared_ptr type_, storage_type_; - const UuidType* uuid_type_{nullptr}; + const ExampleUuidType* uuid_type_{nullptr}; const std::string_view uuid_string1_{UUID_STRING1}; const std::string_view uuid_string2_{UUID_STRING2}; diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 6515631f202..a4526e31c2b 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -27,14 +27,14 @@ namespace arrow { -class ARROW_TESTING_EXPORT UuidArray : public ExtensionArray { +class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; -class ARROW_TESTING_EXPORT UuidType : public ExtensionType { +class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType { public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} + ExampleUuidType() : ExtensionType(fixed_size_binary(16)) {} std::string extension_name() const override { return "uuid"; } diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 95de16c715f..ae2e53b30a3 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,9 +49,13 @@ #include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/datum.h" +#include "arrow/io/memory.h" #include "arrow/ipc/json_simple.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" @@ -847,17 +851,17 @@ Future<> SleepABitAsync() { /////////////////////////////////////////////////////////////////////////// // Extension types -bool UuidType::ExtensionEquals(const ExtensionType& other) const { +bool ExampleUuidType::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); } -std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { +std::shared_ptr ExampleUuidType::MakeArray(std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); - return std::make_shared(data); + return std::make_shared(data); } -Result> UuidType::Deserialize( +Result> ExampleUuidType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { if (serialized != "uuid-serialized") { return Status::Invalid("Type identifier did not match: '", serialized, "'"); @@ -866,7 +870,7 @@ Result> UuidType::Deserialize( return Status::Invalid("Invalid storage type for UuidType: ", storage_type->ToString()); } - return std::make_shared(); + return std::make_shared(); } bool SmallintType::ExtensionEquals(const ExtensionType& other) const { @@ -982,7 
+986,7 @@ Result> Complex128Type::Deserialize( return std::make_shared(); } -std::shared_ptr uuid() { return std::make_shared(); } +std::shared_ptr uuid() { return std::make_shared(); } std::shared_ptr smallint() { return std::make_shared(); } diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index d395d26cb71..f63aa0d95a4 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('uuid', 'uuid-serialized', + uuid_type = ExtensionType('arrow.uuid', '', FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5658f949cee..1106f8aaffd 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -272,6 +272,8 @@ JSON In the future, additional fields may be added, but they are not required to interpret the array. +.. _uuid_extension: + UUID ==== diff --git a/docs/source/status.rst b/docs/source/status.rst index 5e2c2cc19c8..b685d4bbf8a 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -121,7 +121,7 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | JSON | | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| UUID | | | ✓ | | | | | | +| UUID | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | 8-bit Boolean | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 807bcdc3150..d31c93119b7 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,9 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - fixed_shape_tensor, - opaque, - bool8, + bool8, fixed_shape_tensor, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,8 +182,9 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, FixedShapeTensorType, OpaqueType, - Bool8Type, PyExtensionType, UnknownExtensionType, + RunEndEncodedType, Bool8Type, FixedShapeTensorType, + OpaqueType, UuidType, + PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,8 +217,9 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - Bool8Array, scalar, NA, _NULL as NULL, Scalar, + RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, + OpaqueArray, UuidArray, + scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,8 +235,8 @@ def print_entry(label, value): StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, 
DictionaryScalar, MapScalar, StructScalar, UnionScalar, - RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) + RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, + FixedShapeTensorScalar, OpaqueScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 77d6c9c06d2..1587de0e6b7 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4338,6 +4338,12 @@ cdef class ExtensionArray(Array): return result +class UuidArray(ExtensionArray): + """ + Concrete class for Arrow arrays of UUID data type. + """ + + cdef class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6f510cfc0c0..c2346750a19 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2865,6 +2865,16 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: + cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CUuidArray" arrow::extension::UuidArray"(CExtensionArray): + pass + + cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index a7c3b496a00..5c3d981c3ad 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -222,6 +222,9 @@ cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type +cdef class UuidType(BaseExtensionType): + cdef: + const CUuidType* uuid_ext_type cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 19a26bd6c68..d3e2ff2e99d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -120,14 +120,17 @@ cdef api object pyarrow_wrap_data_type( elif type.get().id() == _Type_EXTENSION: ext_type = type.get() cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) + extension_name = ext_type.extension_name() if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() - elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": + elif extension_name == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) - elif ext_type.extension_name() == b"arrow.opaque": + elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) - elif ext_type.extension_name() == b"arrow.bool8": - out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.uuid": + out = UuidType.__new__(UuidType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 72ae2aee5f8..68f77832c43 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -17,6 +17,7 @@ import collections from cython cimport binding +from uuid import UUID cdef class Scalar(_Weakrefable): @@ -1043,6 +1044,15 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +class UuidScalar(ExtensionScalar): + """ + Concrete class for Uuid extension 
scalar. + """ + + def as_py(self): + return None if self.value is None else UUID(bytes=self.value.as_py()) + + cdef class FixedShapeTensorScalar(ExtensionScalar): """ Concrete class for fixed shape tensor extension scalar. diff --git a/python/pyarrow/src/arrow/python/gdb.cc b/python/pyarrow/src/arrow/python/gdb.cc index 6941769e4ef..7c58bae3342 100644 --- a/python/pyarrow/src/arrow/python/gdb.cc +++ b/python/pyarrow/src/arrow/python/gdb.cc @@ -22,7 +22,7 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/datum.h" -#include "arrow/extension_type.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/json_simple.h" #include "arrow/python/gdb.h" #include "arrow/record_batch.h" @@ -37,6 +37,8 @@ namespace arrow { +using extension::uuid; +using extension::UuidType; using ipc::internal::json::ArrayFromJSON; using ipc::internal::json::ChunkedArrayFromJSON; using ipc::internal::json::ScalarFromJSON; @@ -56,29 +58,6 @@ class CustomStatusDetail : public StatusDetail { std::string ToString() const override { return "This is a detail"; } }; -class UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override { - return (other.extension_name() == this->extension_name()); - } - - std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); - } - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override { - return Status::NotImplemented(""); - } - - std::string Serialize() const override { return "uuid-serialized"; } -}; - std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, std::string_view json, int64_t offset = 0, int64_t length = -1) { diff --git a/python/pyarrow/tests/extensions.pyx b/python/pyarrow/tests/extensions.pyx index c1bf9aae1ec..309b574dc02 100644 --- a/python/pyarrow/tests/extensions.pyx +++ b/python/pyarrow/tests/extensions.pyx @@ -37,7 +37,7 @@ cdef extern from * namespace "arrow::py" nogil: class UuidType : public ExtensionType { public: UuidType() : ExtensionType(fixed_size_binary(16)) {} - std::string extension_name() const override { return "uuid"; } + std::string extension_name() const override { return "example-uuid"; } bool ExtensionEquals(const ExtensionType& other) const override { return other.extension_name() == this->extension_name(); diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 0d50c467e96..aacbd2cb6e7 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -95,18 +95,21 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidScalarType(pa.ExtensionScalar): +class ExampleUuidScalarType(pa.ExtensionScalar): def as_py(self): return None if self.value is None else UUID(bytes=self.value.as_py()) -class UuidType(pa.ExtensionType): +class ExampleUuidType(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType') + + def __reduce__(self): + return ExampleUuidType, () def __arrow_ext_scalar_class__(self): - return UuidScalarType + return ExampleUuidScalarType def __arrow_ext_serialize__(self): return b'' @@ -116,10 +119,10 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidType2(pa.ExtensionType): +class 
ExampleUuidType2(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType2') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType2') def __arrow_ext_serialize__(self): return b'' @@ -250,8 +253,8 @@ def ipc_read_batch(buf): def test_ext_type_basics(): - ty = UuidType() - assert ty.extension_name == "pyarrow.tests.UuidType" + ty = ExampleUuidType() + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" def test_ext_type_str(): @@ -267,16 +270,16 @@ def test_ext_type_repr(): def test_ext_type_lifetime(): - ty = UuidType() + ty = ExampleUuidType() wr = weakref.ref(ty) del ty assert wr() is None def test_ext_type_storage_type(): - ty = UuidType() + ty = ExampleUuidType() assert ty.storage_type == pa.binary(16) - assert ty.__class__ is UuidType + assert ty.__class__ is ExampleUuidType ty = ParamExtType(5) assert ty.storage_type == pa.binary(5) assert ty.__class__ is ParamExtType @@ -284,7 +287,7 @@ def test_ext_type_storage_type(): def test_ext_type_byte_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.byte_width == 16 ty = ParamExtType(5) assert ty.byte_width == 5 @@ -297,7 +300,7 @@ def test_ext_type_byte_width(): def test_ext_type_bit_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.bit_width == 128 ty = ParamExtType(5) assert ty.bit_width == 40 @@ -309,7 +312,7 @@ def test_ext_type_bit_width(): def test_ext_type_as_py(): - ty = UuidType() + ty = ExampleUuidType() expected = uuid4() scalar = pa.ExtensionScalar.from_storage(ty, expected.bytes) assert scalar.as_py() == expected @@ -342,12 +345,22 @@ def test_ext_type_as_py(): def test_uuid_type_pickle(pickle_module): for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): - ty = UuidType() + ty = ExampleUuidType() ser = pickle_module.dumps(ty, protocol=proto) del ty ty = pickle_module.loads(ser) wr = weakref.ref(ty) - assert ty.extension_name == "pyarrow.tests.UuidType" + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" + del ty + assert wr() is None + + for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): + ty = pa.uuid() + ser = pickle_module.dumps(ty, protocol=proto) + del ty + ty = pickle_module.loads(ser) + wr = weakref.ref(ty) + assert ty.extension_name == "arrow.uuid" del ty assert wr() is None @@ -358,8 +371,8 @@ def test_ext_type_equality(): c = ParamExtType(6) assert a != b assert b == c - d = UuidType() - e = UuidType() + d = ExampleUuidType() + e = ExampleUuidType() assert a != d assert d == e @@ -403,7 +416,7 @@ def test_ext_array_equality(): storage1 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage2 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage3 = pa.array([], type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) a = pa.ExtensionArray.from_storage(ty1, storage1) @@ -451,9 +464,9 @@ def test_ext_scalar_from_array(): data = [b"0123456789abcdef", b"0123456789abcdef", b"zyxwvutsrqponmlk", None] storage = pa.array(data, type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) - ty3 = UuidType2() + ty3 = ExampleUuidType2() a = pa.ExtensionArray.from_storage(ty1, storage) b = pa.ExtensionArray.from_storage(ty2, storage) @@ -462,9 +475,9 @@ def test_ext_scalar_from_array(): scalars_a = list(a) assert len(scalars_a) == 4 - assert ty1.__arrow_ext_scalar_class__() == UuidScalarType - assert isinstance(a[0], UuidScalarType) - assert isinstance(scalars_a[0], UuidScalarType) + assert 
ty1.__arrow_ext_scalar_class__() == ExampleUuidScalarType + assert isinstance(a[0], ExampleUuidScalarType) + assert isinstance(scalars_a[0], ExampleUuidScalarType) for s, val in zip(scalars_a, data): assert isinstance(s, pa.ExtensionScalar) @@ -505,7 +518,7 @@ def test_ext_scalar_from_array(): def test_ext_scalar_from_storage(): - ty = UuidType() + ty = ExampleUuidType() s = pa.ExtensionScalar.from_storage(ty, None) assert isinstance(s, pa.ExtensionScalar) @@ -706,14 +719,14 @@ def test_cast_between_extension_types(): tiny_int_arr.cast(pa.int64()).cast(IntegerType()) # Between the same extension types is okay - array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(UuidType()) - out = array.cast(UuidType()) - assert out.type == UuidType() + array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(ExampleUuidType()) + out = array.cast(ExampleUuidType()) + assert out.type == ExampleUuidType() # Will still fail casting between extensions who share storage type, # can only cast between exactly the same extension types. with pytest.raises(TypeError, match='Casting from *'): - array.cast(UuidType2()) + array.cast(ExampleUuidType2()) def test_cast_to_extension_with_extension_storage(): @@ -744,10 +757,10 @@ def test_cast_nested_extension_types(data, type_factory): def test_casting_dict_array_to_extension_type(): storage = pa.array([b"0123456789abcdef"], type=pa.binary(16)) - arr = pa.ExtensionArray.from_storage(UuidType(), storage) + arr = pa.ExtensionArray.from_storage(ExampleUuidType(), storage) dict_arr = pa.DictionaryArray.from_arrays(pa.array([0, 0], pa.int32()), arr) - out = dict_arr.cast(UuidType()) + out = dict_arr.cast(ExampleUuidType()) assert isinstance(out, pa.ExtensionArray) assert out.to_pylist() == [UUID('30313233-3435-3637-3839-616263646566'), UUID('30313233-3435-3637-3839-616263646566')] @@ -1347,7 +1360,7 @@ def test_cpp_extension_in_python(tmpdir): mod = __import__('extensions') uuid_type = mod._make_uuid_type() - assert uuid_type.extension_name == "uuid" + assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) array = mod._make_uuid_array() @@ -1356,6 +1369,31 @@ def test_cpp_extension_in_python(tmpdir): assert array[0].as_py() == b'abcdefghijklmno0' assert array[1].as_py() == b'0onmlkjihgfedcba' + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["example-uuid"])) + + batch = ipc_read_batch(buf) + reconstructed_array = batch.column(0) + assert reconstructed_array.type == uuid_type + assert reconstructed_array == array + + +def test_uuid_extension(): + data = [b"0123456789abcdef", b"0123456789abcdef", + b"zyxwvutsrqponmlk", None] + + uuid_type = pa.uuid() + assert uuid_type.extension_name == "arrow.uuid" + assert uuid_type.storage_type == pa.binary(16) + assert uuid_type.__class__ is pa.UuidType + + storage = pa.array(data, pa.binary(16)) + array = pa.ExtensionArray.from_storage(uuid_type, storage) + assert array.type == uuid_type + + assert array.to_pylist() == [x if x is None else UUID(bytes=x) for x in data] + assert array[0].as_py() == UUID(bytes=data[0]) + assert array[3].as_py() is None + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["uuid"])) batch = ipc_read_batch(buf) @@ -1363,6 +1401,9 @@ def test_cpp_extension_in_python(tmpdir): assert reconstructed_array.type == uuid_type assert reconstructed_array == array + assert uuid_type.__arrow_ext_scalar_class__() == pa.UuidScalar + assert isinstance(array[0], pa.UuidScalar) + def test_tensor_type(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 
3]) diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 0d12d710dcf..2ac2f55754f 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -409,7 +409,7 @@ def test_types_stack(gdb_arrow): check_stack_repr( gdb_arrow, "uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -447,7 +447,7 @@ def test_types_heap(gdb_arrow): check_heap_repr( gdb_arrow, "heap_uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -716,12 +716,12 @@ def test_scalars_stack(gdb_arrow): check_stack_repr( gdb_arrow, "extension_scalar", - ('arrow::ExtensionScalar of type "extension", ' + ('arrow::ExtensionScalar of type "extension", ' 'value arrow::FixedSizeBinaryScalar of size 16, ' 'value "0123456789abcdef"')) check_stack_repr( gdb_arrow, "extension_scalar_null", - 'arrow::ExtensionScalar of type "extension", null value') + 'arrow::ExtensionScalar of type "extension", null value') def test_scalars_heap(gdb_arrow): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 563782f0c26..f83ecc3aa43 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1765,6 +1765,25 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar +cdef class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.uuid_ext_type = type.get() + + def __arrow_ext_class__(self): + return UuidArray + + def __reduce__(self): + return uuid, () + + def __arrow_ext_scalar_class__(self): + return UuidScalar + + cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. @@ -5208,6 +5227,21 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) +def uuid(): + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + + cdef UuidType out = UuidType.__new__(UuidType) + c_uuid_ext_type = GetResultValue(CUuidType.Make()) + out.init(c_uuid_ext_type) + return out + + def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None): """ Create instance of fixed shape tensor extension type with shape and optional From 8eb7bd4115da0027aad6362f0fe0901ec44b0616 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:12:57 +0900 Subject: [PATCH 073/157] MINOR: [Go] Bump github.com/hamba/avro/v2 from 2.24.1 to 2.25.0 in /go (#43829) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/hamba/avro/v2](https://github.com/hamba/avro) from 2.24.1 to 2.25.0.
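Illustrative of the Python-facing API added by the UUID patch (GH-15058) above: a minimal sketch mirroring the `test_uuid_extension` test in that diff, assuming a pyarrow build that includes the change.

```python
import pyarrow as pa
from uuid import UUID

# Canonical arrow.uuid extension type, backed by FixedSizeBinary(16) storage.
uuid_type = pa.uuid()
assert uuid_type.extension_name == "arrow.uuid"
assert uuid_type.storage_type == pa.binary(16)

# Wrap a fixed_size_binary(16) storage array as a UUID extension array.
storage = pa.array([b"0123456789abcdef", None], type=pa.binary(16))
arr = pa.ExtensionArray.from_storage(uuid_type, storage)

# Scalars are UuidScalar and convert to Python's uuid.UUID via as_py().
assert isinstance(arr[0], pa.UuidScalar)
assert arr[0].as_py() == UUID(bytes=b"0123456789abcdef")
assert arr[1].as_py() is None
```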
Release notes

Sourced from github.com/hamba/avro/v2's releases.

v2.25.0

What's Changed

New Contributors

Full Changelog: https://github.com/hamba/avro/compare/v2.24.1...v2.24.2

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/hamba/avro/v2&package-manager=go_modules&previous-version=2.24.1&new-version=2.25.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 9f4222a541b..97ac0568597 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,7 @@ require ( require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.24.1 + github.com/hamba/avro/v2 v2.25.0 github.com/huandu/xstrings v1.4.0 github.com/substrait-io/substrait-go v0.6.0 github.com/tidwall/sjson v1.2.5 diff --git a/go/go.sum b/go/go.sum index c7eb3a66dee..bd761e15894 100644 --- a/go/go.sum +++ b/go/go.sum @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.24.1 h1:Xi+7AnhaAc41aA/jmmYpxMsdEDOf1rdup6NJ85P7q2I= -github.com/hamba/avro/v2 v2.24.1/go.mod h1:7vDfy/2+kYCE8WUHoj2et59GTv0ap7ptktMXu0QHePI= +github.com/hamba/avro/v2 v2.25.0 h1:9qig/K4VP5tMq6DuKGfI6YdXncTkPJT1IJDMSv82EeI= +github.com/hamba/avro/v2 v2.25.0/go.mod h1:I8glyswHnpED3Nlx2ZdUe+4LJnCOOyiCzLMno9i/Uu0= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= From 93c5ddb957bb93421a8f84dbd7c5a5b7be2d6d45 Mon Sep 17 00:00:00 2001 From: PANKAJ9768 <48675737+PANKAJ9768@users.noreply.github.com> Date: Tue, 27 Aug 2024 05:59:09 +0530 Subject: [PATCH 074/157] GH-43667: [Java] Keeping Flight default header size consistent between server and client (#43697) ### Rationale for this change ### What changes are included in this PR? Flight client can send header size larger than server can accept. This PR is to keep default values consistent across server and client. ### Are these changes tested? ### Are there any user-facing changes? 
* GitHub Issue: #43667 Authored-by: pankaj kesari Signed-off-by: David Li --- .../org/apache/arrow/flight/FlightServer.java | 7 ++ .../arrow/flight/TestFlightService.java | 73 +++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java index 05dbe42c491..ac761457f57 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java @@ -188,6 +188,7 @@ public static final class Builder { private CallHeaderAuthenticator headerAuthenticator = CallHeaderAuthenticator.NO_OP; private ExecutorService executor = null; private int maxInboundMessageSize = MAX_GRPC_MESSAGE_SIZE; + private int maxHeaderListSize = MAX_GRPC_MESSAGE_SIZE; private int backpressureThreshold = DEFAULT_BACKPRESSURE_THRESHOLD; private InputStream certChain; private InputStream key; @@ -324,6 +325,7 @@ public FlightServer build() { builder .executor(exec) .maxInboundMessageSize(maxInboundMessageSize) + .maxInboundMetadataSize(maxHeaderListSize) .addService( ServerInterceptors.intercept( flightService, @@ -366,6 +368,11 @@ public FlightServer build() { return new FlightServer(location, builder.build(), grpcExecutor); } + public Builder setMaxHeaderListSize(int maxHeaderListSize) { + this.maxHeaderListSize = maxHeaderListSize; + return this; + } + /** * Set the maximum size of a message. Defaults to "unlimited", depending on the underlying * transport. diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java index 5ebeb44c1d3..fc3f83e4eaf 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java @@ -27,6 +27,7 @@ import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.Optional; +import java.util.Random; import org.apache.arrow.flight.impl.Flight; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; @@ -152,4 +153,76 @@ public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor assertEquals("No schema is present in FlightInfo", e.getMessage()); } } + + /** + * Test for GH-41584 where flight defaults for header size was not in sync b\w client and server. 
+ */ + @Test + public void testHeaderSizeExchangeInService() throws Exception { + final FlightProducer producer = + new NoOpFlightProducer() { + @Override + public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor) { + String longHeader = + context.getMiddleware(FlightConstants.HEADER_KEY).headers().get("long-header"); + return new FlightInfo( + null, + descriptor, + Collections.emptyList(), + 0, + 0, + false, + IpcOption.DEFAULT, + longHeader.getBytes(StandardCharsets.UTF_8)); + } + }; + + String headerVal = generateRandom(1024 * 10); + FlightCallHeaders callHeaders = new FlightCallHeaders(); + callHeaders.insert("long-header", headerVal); + // sever with default header limit same as client + try (final FlightServer s = + FlightServer.builder(allocator, forGrpcInsecure(LOCALHOST, 0), producer) + .build() + .start(); + final FlightClient client = FlightClient.builder(allocator, s.getLocation()).build()) { + FlightInfo flightInfo = + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders)); + assertEquals(Optional.empty(), flightInfo.getSchemaOptional()); + assertEquals(new Schema(Collections.emptyList()), flightInfo.getSchema()); + assertArrayEquals(flightInfo.getAppMetadata(), headerVal.getBytes(StandardCharsets.UTF_8)); + } + // server with 15kb header limit + try (final FlightServer s = + FlightServer.builder(allocator, forGrpcInsecure(LOCALHOST, 0), producer) + .setMaxHeaderListSize(1024 * 15) + .build() + .start(); + final FlightClient client = FlightClient.builder(allocator, s.getLocation()).build()) { + FlightInfo flightInfo = + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders)); + assertEquals(Optional.empty(), flightInfo.getSchemaOptional()); + assertEquals(new Schema(Collections.emptyList()), flightInfo.getSchema()); + assertArrayEquals(flightInfo.getAppMetadata(), headerVal.getBytes(StandardCharsets.UTF_8)); + + callHeaders.insert("another-header", headerVal + headerVal); + FlightRuntimeException e = + assertThrows( + FlightRuntimeException.class, + () -> + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders))); + assertEquals("http2 exception", e.getMessage()); + } + } + + private static String generateRandom(int size) { + String aToZ = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"; + Random random = new Random(); + StringBuilder res = new StringBuilder(); + for (int i = 0; i < size; i++) { + int randIndex = random.nextInt(aToZ.length()); + res.append(aToZ.charAt(randIndex)); + } + return res.toString(); + } } From 11f92491b1d2ecf700e6e023a1e413ec4c4345ae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:06:13 +0900 Subject: [PATCH 075/157] MINOR: [Go] Bump github.com/substrait-io/substrait-go from 0.6.0 to 0.7.0 in /go (#43830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/substrait-io/substrait-go](https://github.com/substrait-io/substrait-go) from 0.6.0 to 0.7.0.
Release notes

Sourced from github.com/substrait-io/substrait-go's releases.

v0.7.0 (2024-08-25)

Features

  • Add convenience literal APIs (#47) (597afdb)
    • Introduce literal package

Changes to the build process or auxiliary tools and libraries such as documentation generation

  • extensions: Minor refactoring in extension_mgr.go (#45) (cbd28cb)
    • Minor refactoring in extension_mgr.go
  • Move typeName maps to types package (#46) (5556c23)
Commits
  • 597afdb feat: Add convenience literal APIs (#47)
  • e77df67 feat(types) Make time precision value explicit (#49)
  • a3e8ee0 feat(substrait) Update to substrait v0.55.0 (#48)
  • 2229c12 ci(build-test): golangci should use the go.mod version of golang (#51)
  • cbd28cb chore(extensions): Minor refactoring in extension_mgr.go (#45)
  • 5556c23 chore: Move typeName maps to types package (#46)
  • dd790cb Add a function registry for a given BFT dialect (#32)
  • 828636c ci(build-test): Add golangci-lint to do import checking and other linting (#42)
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/substrait-io/substrait-go&package-manager=go_modules&previous-version=0.6.0&new-version=0.7.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 97ac0568597..a995eee24d5 100644 --- a/go/go.mod +++ b/go/go.mod @@ -49,7 +49,7 @@ require ( github.com/google/uuid v1.6.0 github.com/hamba/avro/v2 v2.25.0 github.com/huandu/xstrings v1.4.0 - github.com/substrait-io/substrait-go v0.6.0 + github.com/substrait-io/substrait-go v0.7.0 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index bd761e15894..6f22e11aef0 100644 --- a/go/go.sum +++ b/go/go.sum @@ -99,8 +99,8 @@ github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/substrait-io/substrait-go v0.6.0 h1:n2G/SGmrn7U5Q39VA8WeM2UfVL5Y/6HX8WAP9uJLNk4= -github.com/substrait-io/substrait-go v0.6.0/go.mod h1:cl8Wsc7aBPDfcHp9+OrUqGpjkgrYlhcDsH/lMP6KUZA= +github.com/substrait-io/substrait-go v0.7.0 h1:53yi73t4wW383+RD1YuhXhbjhP1KzF9GCxPC7SsRlqc= +github.com/substrait-io/substrait-go v0.7.0/go.mod h1:7mjSvIaxk94bOF+YZn/vBOpHK4DWTpBv7nC/btjXCmc= github.com/tidwall/gjson v1.14.2 h1:6BBkirS0rAHjumnjHF6qgy5d2YAJ1TLIaFE2lzfOLqo= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= From a49493d96bc3021af1a126ce33f859bfb7a2ec80 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 27 Aug 2024 11:44:19 +0900 Subject: [PATCH 076/157] MINOR: [Java] Downgrade gRPC to 1.65 (#43839) ### Rationale for this change Newer versions don't run in all CI pipelines due to protoc using a newer glibc. ### What changes are included in this PR? This reverts commit 4af1e491df7ac22217656668b65c3e8d55f5b5ab. ### Are these changes tested? N/A ### Are there any user-facing changes? No Authored-by: David Li Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 77feed12f3f..f78d02c0c65 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -97,7 +97,7 @@ under the License. 2.0.16 33.2.1-jre 4.1.112.Final - 1.66.0 + 1.65.0 3.25.4 2.17.2 3.4.0 From 23fe1ce3361b9a6825fea77deb20d0bd7f247fe2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:56:45 +0900 Subject: [PATCH 077/157] MINOR: [Java] Bump org.apache.commons:commons-compress from 1.27.0 to 1.27.1 in /java (#43826) Bumps org.apache.commons:commons-compress from 1.27.0 to 1.27.1. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.commons:commons-compress&package-manager=maven&previous-version=1.27.0&new-version=1.27.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/compression/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/compression/pom.xml b/java/compression/pom.xml index a1f2bc861da..46ed8796423 100644 --- a/java/compression/pom.xml +++ b/java/compression/pom.xml @@ -50,7 +50,7 @@ under the License. org.apache.commons commons-compress - 1.27.0 + 1.27.1 com.github.luben From fa5d158282b316819e4e23e0903b696467a61d38 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:01:45 -0700 Subject: [PATCH 078/157] MINOR: [C#] Bump Microsoft.NET.Test.Sdk from 17.10.0 to 17.11.0 in /csharp (#43822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Microsoft.NET.Test.Sdk](https://github.com/microsoft/vstest) from 17.10.0 to 17.11.0.
Release notes

Sourced from Microsoft.NET.Test.Sdk's releases.

v17.11.0

Full Changelog: https://github.com/microsoft/vstest/compare/v17.10.0...v17.11.0-release-24352-06

v17.11.0-release-24373-02

What's Changed

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Microsoft.NET.Test.Sdk&package-manager=nuget&previous-version=17.10.0&new-version=17.11.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 047cdb94b96..4ea02e0ed21 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index dc95f9edf9f..fd8274230ec 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index e68a97670cc..eae9ab746f2 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index f0533831306..ee71b203218 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -16,7 +16,7 @@ - + all From c30bb6a84536d66bc1179e2a051915d5c34b2616 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 27 Aug 2024 14:49:45 +0900 Subject: [PATCH 079/157] GH-41056: [GLib][FlightRPC] Add gaflight_client_do_put() and related APIs (#43813) ### Rationale for this change DoPut is needed to upload data. ### What changes are included in this PR? * Add `gaflight_client_do_put()` * Add `GAFlightStreamWriter` * Add `GAFlightMetadataReader` * Add `GAFlightDoPutResult` * Fix `GAFlightRecordBatchWriter` API ### Are these changes tested? No. They aren't tested yet. We will add tests when we implement server side DoPut. ### Are there any user-facing changes? Yes. * GitHub Issue: #41056 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-flight-glib/client.cpp | 337 +++++++++++++++++++++++++++- c_glib/arrow-flight-glib/client.h | 46 ++++ c_glib/arrow-flight-glib/client.hpp | 16 ++ c_glib/arrow-flight-glib/common.cpp | 102 ++------- c_glib/arrow-flight-glib/common.h | 8 +- c_glib/arrow-glib/writer.hpp | 4 + 6 files changed, 421 insertions(+), 92 deletions(-) diff --git a/c_glib/arrow-flight-glib/client.cpp b/c_glib/arrow-flight-glib/client.cpp index 80c47e336f8..23f59c9da69 100644 --- a/c_glib/arrow-flight-glib/client.cpp +++ b/c_glib/arrow-flight-glib/client.cpp @@ -33,10 +33,19 @@ G_BEGIN_DECLS * #GAFlightStreamReader is a class for reading record batches from a * server. * + * #GAFlightStreamWriter is a class for writing record batches to a + * server. + * + * #GAFlightMetadataReader is a class for reading metadata from a + * server. + * * #GAFlightCallOptions is a class for options of each call. 
* * #GAFlightClientOptions is a class for options of each client. * + * #GAFlightDoPutResult is a class that has gaflight_client_do_put() + * result. + * * #GAFlightClient is a class for Apache Arrow Flight client. * * Since: 5.0.0 @@ -56,6 +65,128 @@ gaflight_stream_reader_class_init(GAFlightStreamReaderClass *klass) { } +G_DEFINE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT_TYPE_RECORD_BATCH_WRITER) + +static void +gaflight_stream_writer_init(GAFlightStreamWriter *object) +{ +} + +static void +gaflight_stream_writer_class_init(GAFlightStreamWriterClass *klass) +{ +} + +/** + * gaflight_stream_writer_done_writing: + * @writer: A #GAFlightStreamWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + return garrow::check(error, + flight_writer->DoneWriting(), + "[flight-stream-writer][done-writing]"); +} + +struct GAFlightMetadataReaderPrivate +{ + arrow::flight::FlightMetadataReader *reader; +}; + +enum { + PROP_METADATA_READER_READER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightMetadataReader, + gaflight_metadata_reader, + G_TYPE_OBJECT) + +#define GAFLIGHT_METADATA_READER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_metadata_reader_get_instance_private(GAFLIGHT_METADATA_READER(object))) + +static void +gaflight_metadata_reader_finalize(GObject *object) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + delete priv->reader; + G_OBJECT_CLASS(gaflight_metadata_reader_parent_class)->finalize(object); +} + +static void +gaflight_metadata_reader_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_METADATA_READER_READER: + priv->reader = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_metadata_reader_init(GAFlightMetadataReader *object) +{ +} + +static void +gaflight_metadata_reader_class_init(GAFlightMetadataReaderClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_metadata_reader_finalize; + gobject_class->set_property = gaflight_metadata_reader_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "reader", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_METADATA_READER_READER, spec); +} + +/** + * gaflight_metadata_reader_read: + * @reader: A #GAFlightMetadataReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The metadata on success, %NULL on error. 
+ * + * Since: 18.0.0 + */ +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error) +{ + auto flight_reader = gaflight_metadata_reader_get_raw(reader); + std::shared_ptr metadata; + if (garrow::check(error, + flight_reader->ReadMetadata(&metadata), + "[flight-metadata-reader][read]")) { + return garrow_buffer_new_raw(&metadata); + } else { + return nullptr; + } +} + typedef struct GAFlightCallOptionsPrivate_ { arrow::flight::FlightCallOptions options; @@ -385,6 +516,137 @@ gaflight_client_options_new(void) g_object_new(GAFLIGHT_TYPE_CLIENT_OPTIONS, NULL)); } +struct GAFlightDoPutResultPrivate +{ + GAFlightStreamWriter *writer; + GAFlightMetadataReader *reader; +}; + +enum { + PROP_DO_PUT_RESULT_RESULT = 1, + PROP_DO_PUT_RESULT_WRITER, + PROP_DO_PUT_RESULT_READER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightDoPutResult, gaflight_do_put_result, G_TYPE_OBJECT) + +#define GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object) \ + static_cast( \ + gaflight_do_put_result_get_instance_private(GAFLIGHT_DO_PUT_RESULT(object))) + +static void +gaflight_do_put_result_dispose(GObject *object) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + if (priv->writer) { + g_object_unref(priv->writer); + priv->writer = nullptr; + } + + if (priv->reader) { + g_object_unref(priv->reader); + priv->reader = nullptr; + } + + G_OBJECT_CLASS(gaflight_do_put_result_parent_class)->dispose(object); +} + +static void +gaflight_do_put_result_init(GAFlightDoPutResult *object) +{ +} + +static void +gaflight_do_put_result_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_RESULT: + { + auto result = static_cast( + g_value_get_pointer(value)); + priv->writer = gaflight_stream_writer_new_raw(result->writer.release()); + priv->reader = gaflight_metadata_reader_new_raw(result->reader.release()); + break; + } + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_WRITER: + g_value_set_object(value, priv->writer); + break; + case PROP_DO_PUT_RESULT_READER: + g_value_set_object(value, priv->reader); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_class_init(GAFlightDoPutResultClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_do_put_result_dispose; + gobject_class->set_property = gaflight_do_put_result_set_property; + gobject_class->get_property = gaflight_do_put_result_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "result", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_RESULT, spec); + + /** + * GAFlightDoPutResult:writer: + * + * A writer to write record batches to. + * + * Since: 18.0.0 + */ + spec = g_param_spec_object("writer", + nullptr, + nullptr, + GAFLIGHT_TYPE_STREAM_WRITER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_WRITER, spec); + + /** + * GAFlightDoPutResult:reader: + * + * A reader for application metadata from the server. 
+ * + * Since: 18.0.0 + */ + spec = g_param_spec_object("reader", + nullptr, + nullptr, + GAFLIGHT_TYPE_METADATA_READER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_READER, spec); +} + struct GAFlightClientPrivate { std::shared_ptr client; @@ -661,6 +923,51 @@ gaflight_client_do_get(GAFlightClient *client, return gaflight_stream_reader_new_raw(flight_reader.release(), TRUE); } +/** + * gaflight_client_do_put: + * @client: A #GAFlightClient. + * @descriptor: A #GAFlightDescriptor. + * @schema: A #GArrowSchema. + * @options: (nullable): A #GAFlightCallOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Upload data to a Flight described by the given descriptor. The + * caller must call garrow_record_batch_writer_close() on the + * returned stream once they are done writing. + * + * The reader and writer are linked; closing the writer will also + * close the reader. Use garrow_flight_stream_writer_done_writing() to + * only close the write side of the channel. + * + * Returns: (nullable) (transfer full): + * The #GAFlighDoPutResult holding a reader and a writer on success, + * %NULL on error. + * + * Since: 18.0.0 + */ +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error) +{ + auto flight_client = gaflight_client_get_raw(client); + auto flight_descriptor = gaflight_descriptor_get_raw(descriptor); + auto arrow_schema = garrow_schema_get_raw(schema); + arrow::flight::FlightCallOptions flight_default_options; + auto flight_options = &flight_default_options; + if (options) { + flight_options = gaflight_call_options_get_raw(options); + } + auto result = flight_client->DoPut(*flight_options, *flight_descriptor, arrow_schema); + if (!garrow::check(error, result, "[flight-client][do-put]")) { + return nullptr; + } + auto flight_result = std::move(*result); + return gaflight_do_put_result_new_raw(&flight_result); +} + G_END_DECLS GAFlightStreamReader * @@ -672,7 +979,28 @@ gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, flight_reader, "is-owner", is_owner, - NULL)); + nullptr)); +} + +GAFlightStreamWriter * +gaflight_stream_writer_new_raw(arrow::flight::FlightStreamWriter *flight_writer) +{ + return GAFLIGHT_STREAM_WRITER( + g_object_new(GAFLIGHT_TYPE_STREAM_WRITER, "writer", flight_writer, nullptr)); +} + +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader) +{ + return GAFLIGHT_METADATA_READER( + g_object_new(GAFLIGHT_TYPE_METADATA_READER, "reader", flight_reader, nullptr)); +} + +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(reader); + return priv->reader; } arrow::flight::FlightCallOptions * @@ -689,6 +1017,13 @@ gaflight_client_options_get_raw(GAFlightClientOptions *options) return &(priv->options); } +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result) +{ + return GAFLIGHT_DO_PUT_RESULT( + g_object_new(GAFLIGHT_TYPE_DO_PUT_RESULT, "result", flight_result, nullptr)); +} + std::shared_ptr gaflight_client_get_raw(GAFlightClient *client) { diff --git a/c_glib/arrow-flight-glib/client.h b/c_glib/arrow-flight-glib/client.h index a91bbe55e3c..12c5a06b810 100644 --- a/c_glib/arrow-flight-glib/client.h +++ 
b/c_glib/arrow-flight-glib/client.h @@ -35,6 +35,35 @@ struct _GAFlightStreamReaderClass GAFlightRecordBatchReaderClass parent_class; }; +#define GAFLIGHT_TYPE_STREAM_WRITER (gaflight_stream_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT, + STREAM_WRITER, + GAFlightRecordBatchWriter) +struct _GAFlightStreamWriterClass +{ + GAFlightRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error); + +#define GAFLIGHT_TYPE_METADATA_READER (gaflight_metadata_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightMetadataReader, gaflight_metadata_reader, GAFLIGHT, METADATA_READER, GObject) +struct _GAFlightMetadataReaderClass +{ + GObjectClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error); + #define GAFLIGHT_TYPE_CALL_OPTIONS (gaflight_call_options_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( @@ -75,6 +104,15 @@ GAFLIGHT_AVAILABLE_IN_5_0 GAFlightClientOptions * gaflight_client_options_new(void); +#define GAFLIGHT_TYPE_DO_PUT_RESULT (gaflight_do_put_result_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightDoPutResult, gaflight_do_put_result, GAFLIGHT, DO_PUT_RESULT, GObject) +struct _GAFlightDoPutResultClass +{ + GObjectClass parent_class; +}; + #define GAFLIGHT_TYPE_CLIENT (gaflight_client_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightClient, gaflight_client, GAFLIGHT, CLIENT, GObject) @@ -124,4 +162,12 @@ gaflight_client_do_get(GAFlightClient *client, GAFlightCallOptions *options, GError **error); +GAFLIGHT_AVAILABLE_IN_18_0 +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/client.hpp b/c_glib/arrow-flight-glib/client.hpp index 185a28e6dc4..888f87ecb57 100644 --- a/c_glib/arrow-flight-glib/client.hpp +++ b/c_glib/arrow-flight-glib/client.hpp @@ -28,6 +28,18 @@ GAFlightStreamReader * gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, gboolean is_owner); +GAFLIGHT_EXTERN +GAFlightStreamWriter * +gaflight_stream_writer_new_raw(arrow::flight::FlightStreamWriter *flight_writer); + +GAFLIGHT_EXTERN +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader); + +GAFLIGHT_EXTERN +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader); + GAFLIGHT_EXTERN arrow::flight::FlightCallOptions * gaflight_call_options_get_raw(GAFlightCallOptions *options); @@ -36,6 +48,10 @@ GAFLIGHT_EXTERN arrow::flight::FlightClientOptions * gaflight_client_options_get_raw(GAFlightClientOptions *options); +GAFLIGHT_EXTERN +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result); + GAFLIGHT_EXTERN std::shared_ptr gaflight_client_get_raw(GAFlightClient *client); diff --git a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp index f7eea08c264..3deaf67cc14 100644 --- a/c_glib/arrow-flight-glib/common.cpp +++ b/c_glib/arrow-flight-glib/common.cpp @@ -1196,7 +1196,7 @@ gaflight_record_batch_reader_finalize(GObject *object) if (priv->is_owner) { delete priv->reader; } - 
G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); + G_OBJECT_CLASS(gaflight_record_batch_reader_parent_class)->finalize(object); } static void @@ -1300,57 +1300,9 @@ gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError } } -typedef struct GAFlightRecordBatchWriterPrivate_ -{ - arrow::flight::MetadataRecordBatchWriter *writer; - bool is_owner; -} GAFlightRecordBatchWriterPrivate; - -enum { - PROP_RECORD_BATCH_WRITER_WRITER = 1, - PROP_RECORD_BATCH_WRITER_IS_OWNER, -}; - -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchWriter, - gaflight_record_batch_writer, - GARROW_TYPE_RECORD_BATCH_WRITER) - -#define GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object) \ - static_cast( \ - gaflight_record_batch_writer_get_instance_private( \ - GAFLIGHT_RECORD_BATCH_WRITER(object))) - -static void -gaflight_record_batch_writer_finalize(GObject *object) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - if (priv->is_owner) { - delete priv->writer; - } - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); -} - -static void -gaflight_record_batch_writer_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_RECORD_BATCH_WRITER_WRITER: - priv->writer = - static_cast(g_value_get_pointer(value)); - break; - case PROP_RECORD_BATCH_WRITER_IS_OWNER: - priv->is_owner = g_value_get_boolean(value); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} +G_DEFINE_ABSTRACT_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GARROW_TYPE_RECORD_BATCH_WRITER) static void gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) @@ -1360,26 +1312,6 @@ gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) static void gaflight_record_batch_writer_class_init(GAFlightRecordBatchWriterClass *klass) { - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = gaflight_record_batch_writer_finalize; - gobject_class->set_property = gaflight_record_batch_writer_set_property; - - GParamSpec *spec; - spec = g_param_spec_pointer( - "writer", - nullptr, - nullptr, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_WRITER, spec); - - spec = g_param_spec_boolean( - "is-owner", - nullptr, - nullptr, - TRUE, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_IS_OWNER, spec); } /** @@ -1402,7 +1334,8 @@ gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, GArrowWriteOptions *options, GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_schema = garrow_schema_get_raw(schema); arrow::ipc::IpcWriteOptions arrow_write_options; if (options) { @@ -1432,7 +1365,8 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, GArrowBuffer *metadata, GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_metadata = garrow_buffer_get_raw(metadata); return garrow::check(error, flight_writer->WriteMetadata(arrow_metadata), @@ 
-1440,7 +1374,7 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, } /** - * gaflight_record_batch_writer_write: + * gaflight_record_batch_writer_write_record_batch: * @writer: A #GAFlightRecordBatchWriter. * @record_batch: A #GArrowRecordBatch. * @metadata: (nullable): A #GArrowBuffer. @@ -1453,12 +1387,13 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, * Since: 18.0.0 */ gboolean -gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, - GArrowRecordBatch *record_batch, - GArrowBuffer *metadata, - GError **error) +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); auto arrow_metadata = garrow_buffer_get_raw(metadata); return garrow::check( @@ -1599,10 +1534,3 @@ gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader) auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(reader); return priv->reader; } - -arrow::flight::MetadataRecordBatchWriter * -gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(writer); - return priv->writer; -} diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h index 91c828caabb..726132fe492 100644 --- a/c_glib/arrow-flight-glib/common.h +++ b/c_glib/arrow-flight-glib/common.h @@ -259,9 +259,9 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, GAFLIGHT_AVAILABLE_IN_18_0 gboolean -gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, - GArrowRecordBatch *record_batch, - GArrowBuffer *metadata, - GError **error); +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp index aa87ffe77d7..1d85ac52f88 100644 --- a/c_glib/arrow-glib/writer.hpp +++ b/c_glib/arrow-glib/writer.hpp @@ -25,16 +25,20 @@ #include +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchWriter * garrow_record_batch_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL std::shared_ptr garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchStreamWriter * garrow_record_batch_stream_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchFileWriter * garrow_record_batch_file_writer_new_raw( std::shared_ptr *arrow_writer); From b83666234c05d34c23993708160033c259b9ec26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 27 Aug 2024 10:30:23 +0200 Subject: [PATCH 080/157] GH-43815: [CI][Packaging][Python] Avoid uploading wheel to gemfury if version already exists (#43816) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes are included in this PR? Check whether version exists on gemfury before trying upload ### Are these changes tested? Will be tested via archery ### Are there any user-facing changes? 
No * GitHub Issue: #43815 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- dev/tasks/macros.jinja | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 6423ca0e9ef..df55f32222e 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -169,10 +169,14 @@ env: - name: Upload package to Gemfury shell: bash run: | - fury push \ - --api-token=${CROSSBOW_GEMFURY_TOKEN} \ - --as=${CROSSBOW_GEMFURY_ORG} \ - {{ pattern }} + if $(fury versions --as=${CROSSBOW_GEMFURY_ORG} --api-token=${CROSSBOW_GEMFURY_TOKEN} pyarrow | grep --fixed-strings -q "{{ arrow.no_rc_version }}"); then + echo "Version {{ arrow.no_rc_version }} already exists. Avoid pushing version." + else + fury push \ + --api-token=${CROSSBOW_GEMFURY_TOKEN} \ + --as=${CROSSBOW_GEMFURY_ORG} \ + {{ pattern }} + fi env: CROSSBOW_GEMFURY_TOKEN: {{ '${{ secrets.CROSSBOW_GEMFURY_TOKEN }}' }} CROSSBOW_GEMFURY_ORG: {{ '${{ secrets.CROSSBOW_GEMFURY_ORG }}' }} From 6502f0e3ad046d361aba44385ab3379ed7af5b7f Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Tue, 27 Aug 2024 13:17:39 -0400 Subject: [PATCH 081/157] GH-43790: [Go][Parquet] Add support for LZ4_RAW compression codec (#43835) ### Rationale for this change Fixes: #43790 The LZ4 compression codec for Parquet is no longer ambiguous, as it has been superceded by the [LZ4_RAW](https://github.com/apache/parquet-format/blob/master/Compression.md#lz4_raw) spec. ### What changes are included in this PR? - Add `LZ4Raw` compression codec - Split out `StreamingCodec` methods from core `Codec` interface - Various conformance/roundtrip tests - Set of benchmarks for reading/writing an Arrow table to/from Parquet, using each compression codec ### Are these changes tested? Yes ### Are there any user-facing changes? 
- New codec `LZ4Raw` is available - `Codec` interface no long provides the following methods, which are now part of `StreamingCodec`: - `NewReader` - `NewWriter` - `NewWriterLevel` * GitHub Issue: #43790 Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- go/parquet/compress/compress.go | 22 ++-- go/parquet/compress/compress_test.go | 8 +- go/parquet/compress/lz4_raw.go | 66 ++++++++++++ go/parquet/file/file_reader_test.go | 127 +++++++++++++++++++++++ go/parquet/file/file_writer_test.go | 58 ++++++++++- go/parquet/pqarrow/reader_writer_test.go | 111 ++++++++++++++++++++ 6 files changed, 380 insertions(+), 12 deletions(-) create mode 100644 go/parquet/compress/lz4_raw.go diff --git a/go/parquet/compress/compress.go b/go/parquet/compress/compress.go index b6a1349133e..92f2ae99bb1 100644 --- a/go/parquet/compress/compress.go +++ b/go/parquet/compress/compress.go @@ -49,8 +49,9 @@ var Codecs = struct { Brotli Compression // LZ4 unsupported in this library due to problematic issues between the Hadoop LZ4 spec vs regular lz4 // see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E - Lz4 Compression - Zstd Compression + Lz4 Compression + Zstd Compression + Lz4Raw Compression }{ Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED), Snappy: Compression(parquet.CompressionCodec_SNAPPY), @@ -59,17 +60,12 @@ var Codecs = struct { Brotli: Compression(parquet.CompressionCodec_BROTLI), Lz4: Compression(parquet.CompressionCodec_LZ4), Zstd: Compression(parquet.CompressionCodec_ZSTD), + Lz4Raw: Compression(parquet.CompressionCodec_LZ4_RAW), } // Codec is an interface which is implemented for each compression type in order to make the interactions easy to // implement. Most consumers won't be calling GetCodec directly. type Codec interface { - // NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data - NewReader(io.Reader) io.ReadCloser - // NewWriter provides a wrapper around a write stream to compress data before writing it. - NewWriter(io.Writer) io.WriteCloser - // NewWriterLevel is like NewWriter but allows specifying the compression level - NewWriterLevel(io.Writer, int) (io.WriteCloser, error) // Encode encodes a block of data given by src and returns the compressed block. dst should be either nil // or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not // overlap since some of the compression types don't allow it. @@ -90,6 +86,16 @@ type Codec interface { Decode(dst, src []byte) []byte } +// StreamingCodec is an interface that may be implemented for compression codecs that expose a streaming API. +type StreamingCodec interface { + // NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data + NewReader(io.Reader) io.ReadCloser + // NewWriter provides a wrapper around a write stream to compress data before writing it. + NewWriter(io.Writer) io.WriteCloser + // NewWriterLevel is like NewWriter but allows specifying the compression level + NewWriterLevel(io.Writer, int) (io.WriteCloser, error) +} + var codecs = map[Compression]Codec{} // RegisterCodec adds or overrides a codec implementation for a given compression algorithm. 
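The compress.go changes above split the streaming methods out of `Codec`, so block compression and the optional streaming API are now reached separately. A minimal sketch of how a caller might use the new `Lz4Raw` codec after this change (the import path is the one used by the tests below; the sample data is illustrative):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/apache/arrow/go/v18/parquet/compress"
)

func main() {
	// Look up the new LZ4_RAW codec; block-level Encode/Decode stay on Codec.
	codec, err := compress.GetCodec(compress.Codecs.Lz4Raw)
	if err != nil {
		panic(err)
	}

	src := bytes.Repeat([]byte("repetitive data "), 64)

	// Size the destination with CompressBound, as the Encode contract requires.
	dst := make([]byte, 0, int(codec.CompressBound(int64(len(src)))))
	compressed := codec.Encode(dst, src)

	// Decode needs a destination large enough to hold the uncompressed data.
	decompressed := codec.Decode(make([]byte, len(src)), compressed)
	fmt.Println(bytes.Equal(src, decompressed)) // true

	// Streaming readers/writers are now opt-in: only codecs that also
	// implement StreamingCodec expose NewReader/NewWriter/NewWriterLevel.
	if _, ok := codec.(compress.StreamingCodec); !ok {
		fmt.Println("lz4_raw has no streaming API")
	}
}
```

When writing Parquet files, the codec is instead selected through the writer properties, e.g. `parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Lz4Raw))`, as the round-trip test further down does.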
diff --git a/go/parquet/compress/compress_test.go b/go/parquet/compress/compress_test.go index 843062c0d02..5aac74759e1 100644 --- a/go/parquet/compress/compress_test.go +++ b/go/parquet/compress/compress_test.go @@ -66,8 +66,8 @@ func TestCompressDataOneShot(t *testing.T) { {compress.Codecs.Gzip}, {compress.Codecs.Brotli}, {compress.Codecs.Zstd}, + {compress.Codecs.Lz4Raw}, // {compress.Codecs.Lzo}, - // {compress.Codecs.Lz4}, } for _, tt := range tests { @@ -107,9 +107,11 @@ func TestCompressReaderWriter(t *testing.T) { var buf bytes.Buffer codec, err := compress.GetCodec(tt.c) assert.NoError(t, err) + streamingCodec, ok := codec.(compress.StreamingCodec) + assert.True(t, ok) data := makeRandomData(RandomDataSize) - wr := codec.NewWriter(&buf) + wr := streamingCodec.NewWriter(&buf) const chunkSize = 1111 input := data @@ -129,7 +131,7 @@ func TestCompressReaderWriter(t *testing.T) { } wr.Close() - rdr := codec.NewReader(&buf) + rdr := streamingCodec.NewReader(&buf) out, err := io.ReadAll(rdr) assert.NoError(t, err) assert.Exactly(t, data, out) diff --git a/go/parquet/compress/lz4_raw.go b/go/parquet/compress/lz4_raw.go new file mode 100644 index 00000000000..788d9520a66 --- /dev/null +++ b/go/parquet/compress/lz4_raw.go @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package compress + +import ( + "sync" + + "github.com/pierrec/lz4/v4" +) + +// lz4.Compressor is not goroutine-safe, so we use a pool to amortize the cost +// of allocating a new one for each call to Encode(). 
+var compressorPool = sync.Pool{New: func() interface{} { return new(lz4.Compressor) }} + +func compressBlock(src, dst []byte) (int, error) { + c := compressorPool.Get().(*lz4.Compressor) + defer compressorPool.Put(c) + return c.CompressBlock(src, dst) +} + +type lz4RawCodec struct{} + +func (c lz4RawCodec) Encode(dst, src []byte) []byte { + n, err := compressBlock(src, dst[:cap(dst)]) + if err != nil { + panic(err) + } + + return dst[:n] +} + +func (c lz4RawCodec) EncodeLevel(dst, src []byte, _ int) []byte { + // the lz4 block implementation does not allow level to be set + return c.Encode(dst, src) +} + +func (lz4RawCodec) Decode(dst, src []byte) []byte { + n, err := lz4.UncompressBlock(src, dst) + if err != nil { + panic(err) + } + + return dst[:n] +} + +func (c lz4RawCodec) CompressBound(len int64) int64 { + return int64(lz4.CompressBlockBound(int(len))) +} + +func init() { + RegisterCodec(Codecs.Lz4Raw, lz4RawCodec{}) +} diff --git a/go/parquet/file/file_reader_test.go b/go/parquet/file/file_reader_test.go index 547ec475c27..35f4da4e866 100644 --- a/go/parquet/file/file_reader_test.go +++ b/go/parquet/file/file_reader_test.go @@ -644,3 +644,130 @@ func TestDeltaBinaryPackedMultipleBatches(t *testing.T) { require.Equalf(t, size, totalRows, "Expected %d rows, but got %d rows", size, totalRows) } + +// Test read file lz4_raw_compressed.parquet +// Contents documented at https://github.com/apache/parquet-testing/commit/ddd898958803cb89b7156c6350584d1cda0fe8de +func TestLZ4RawFileRead(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + fileReader, err := file.OpenParquetFile(path.Join(dir, "lz4_raw_compressed.parquet"), + false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + nRows := 4 + nCols := 3 + require.Equal(t, 1, fileReader.NumRowGroups()) + rgr := fileReader.RowGroup(0) + require.EqualValues(t, nRows, rgr.NumRows()) + require.EqualValues(t, nCols, rgr.NumColumns()) + + rdr, err := rgr.Column(0) + require.NoError(t, err) + + rowsInt64, ok := rdr.(*file.Int64ColumnChunkReader) + require.True(t, ok) + + valsInt64 := make([]int64, nRows) + total, read, err := rowsInt64.ReadBatch(int64(nRows), valsInt64, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsInt64 := []int64{ + 1593604800, + 1593604800, + 1593604801, + 1593604801, + } + require.Equal(t, expectedValsInt64, valsInt64) + + rdr, err = rgr.Column(1) + require.NoError(t, err) + + rowsByteArray, ok := rdr.(*file.ByteArrayColumnChunkReader) + require.True(t, ok) + + valsByteArray := make([]parquet.ByteArray, nRows) + total, read, err = rowsByteArray.ReadBatch(int64(nRows), valsByteArray, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsByteArray := []parquet.ByteArray{ + []byte("abc"), + []byte("def"), + []byte("abc"), + []byte("def"), + } + require.Equal(t, expectedValsByteArray, valsByteArray) + + rdr, err = rgr.Column(2) + require.NoError(t, err) + + rowsFloat64, ok := rdr.(*file.Float64ColumnChunkReader) + require.True(t, ok) + + valsFloat64 := make([]float64, nRows) + total, read, err = rowsFloat64.ReadBatch(int64(nRows), valsFloat64, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsFloat64 := 
[]float64{ + 42.0, + 7.7, + 42.125, + 7.7, + } + require.Equal(t, expectedValsFloat64, valsFloat64) +} + +// Test read file lz4_raw_compressed_larger.parquet +// Contents documented at https://github.com/apache/parquet-testing/commit/ddd898958803cb89b7156c6350584d1cda0fe8de +func TestLZ4RawLargerFileRead(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + fileReader, err := file.OpenParquetFile(path.Join(dir, "lz4_raw_compressed_larger.parquet"), + false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + nRows := 10000 + nCols := 1 + require.Equal(t, 1, fileReader.NumRowGroups()) + rgr := fileReader.RowGroup(0) + require.EqualValues(t, nRows, rgr.NumRows()) + require.EqualValues(t, nCols, rgr.NumColumns()) + + rdr, err := rgr.Column(0) + require.NoError(t, err) + + rows, ok := rdr.(*file.ByteArrayColumnChunkReader) + require.True(t, ok) + + vals := make([]parquet.ByteArray, nRows) + total, read, err := rows.ReadBatch(int64(nRows), vals, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsHead := []parquet.ByteArray{ + []byte("c7ce6bef-d5b0-4863-b199-8ea8c7fb117b"), + []byte("e8fb9197-cb9f-4118-b67f-fbfa65f61843"), + []byte("885136e1-0aa1-4fdb-8847-63d87b07c205"), + []byte("ce7b2019-8ebe-4906-a74d-0afa2409e5df"), + []byte("a9ee2527-821b-4b71-a926-03f73c3fc8b7"), + } + require.Equal(t, expectedValsHead, vals[:len(expectedValsHead)]) +} diff --git a/go/parquet/file/file_writer_test.go b/go/parquet/file/file_writer_test.go index 0faf3f7233b..12ac93d1ef4 100644 --- a/go/parquet/file/file_writer_test.go +++ b/go/parquet/file/file_writer_test.go @@ -260,7 +260,7 @@ func (t *SerializeTestSuite) TestSmallFile() { compress.Codecs.Brotli, compress.Codecs.Gzip, compress.Codecs.Zstd, - // compress.Codecs.Lz4, + compress.Codecs.Lz4Raw, // compress.Codecs.Lzo, } for _, c := range codecs { @@ -540,3 +540,59 @@ func TestBatchedByteStreamSplitFileRoundtrip(t *testing.T) { require.NoError(t, rdr.Close()) } + +func TestLZ4RawFileRoundtrip(t *testing.T) { + input := []int64{ + -1, 0, 1, 2, 3, 4, 5, 123456789, -123456789, + } + + size := len(input) + + field, err := schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required, nil, parquet.Types.Int64, 0, 1) + require.NoError(t, err) + + schema, err := schema.NewGroupNode("test", parquet.Repetitions.Required, schema.FieldList{field}, 0) + require.NoError(t, err) + + sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) + writer := file.NewParquetWriter(sink, schema, file.WithWriterProps(parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Lz4Raw)))) + + rgw := writer.AppendRowGroup() + cw, err := rgw.NextColumn() + require.NoError(t, err) + + i64ColumnWriter, ok := cw.(*file.Int64ColumnChunkWriter) + require.True(t, ok) + + nVals, err := i64ColumnWriter.WriteBatch(input, nil, nil) + require.NoError(t, err) + require.EqualValues(t, size, nVals) + + require.NoError(t, cw.Close()) + require.NoError(t, rgw.Close()) + require.NoError(t, writer.Close()) + + rdr, err := file.NewParquetReader(bytes.NewReader(sink.Bytes())) + require.NoError(t, err) + + require.Equal(t, 1, rdr.NumRowGroups()) + require.EqualValues(t, size, rdr.NumRows()) + + rgr := rdr.RowGroup(0) + cr, err := rgr.Column(0) + require.NoError(t, err) + + i64ColumnReader, ok := 
cr.(*file.Int64ColumnChunkReader) + require.True(t, ok) + + output := make([]int64, size) + + total, valuesRead, err := i64ColumnReader.ReadBatch(int64(size), output, nil, nil) + require.NoError(t, err) + require.EqualValues(t, size, total) + require.EqualValues(t, size, valuesRead) + + require.Equal(t, input, output) + + require.NoError(t, rdr.Close()) +} diff --git a/go/parquet/pqarrow/reader_writer_test.go b/go/parquet/pqarrow/reader_writer_test.go index 31bd0eba843..e020c7d9457 100644 --- a/go/parquet/pqarrow/reader_writer_test.go +++ b/go/parquet/pqarrow/reader_writer_test.go @@ -19,6 +19,8 @@ package pqarrow_test import ( "bytes" "context" + "fmt" + "math" "testing" "unsafe" @@ -26,8 +28,10 @@ import ( "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/parquet" + "github.com/apache/arrow/go/v18/parquet/compress" "github.com/apache/arrow/go/v18/parquet/file" "github.com/apache/arrow/go/v18/parquet/pqarrow" + "github.com/stretchr/testify/require" "golang.org/x/exp/rand" "gonum.org/v1/gonum/stat/distuv" ) @@ -275,3 +279,110 @@ func BenchmarkReadColumnFloat64(b *testing.B) { benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN))) } } + +var compressTestCases = []struct { + c compress.Compression +}{ + {compress.Codecs.Uncompressed}, + {compress.Codecs.Snappy}, + {compress.Codecs.Gzip}, + {compress.Codecs.Brotli}, + {compress.Codecs.Zstd}, + {compress.Codecs.Lz4Raw}, + // {compress.Codecs.Lzo}, +} + +func buildTableForTest(mem memory.Allocator) arrow.Table { + schema := arrow.NewSchema( + []arrow.Field{ + {Name: "int64s", Type: arrow.PrimitiveTypes.Int64}, + {Name: "strings", Type: arrow.BinaryTypes.String}, + {Name: "bools", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "repeated_int64s", Type: arrow.PrimitiveTypes.Int64}, + {Name: "repeated_strings", Type: arrow.BinaryTypes.String}, + {Name: "repeated_bools", Type: arrow.FixedWidthTypes.Boolean}, + }, + nil, + ) + bldr := array.NewRecordBuilder(mem, schema) + defer bldr.Release() + + for i := 0; i < SIZELEN; i++ { + bldr.Field(0).(*array.Int64Builder).Append(int64(i)) + bldr.Field(1).(*array.StringBuilder).Append(fmt.Sprint(i)) + bldr.Field(2).(*array.BooleanBuilder).Append(i%2 == 0) + bldr.Field(3).(*array.Int64Builder).Append(0) + bldr.Field(4).(*array.StringBuilder).Append("the string is the same") + bldr.Field(5).(*array.BooleanBuilder).Append(true) + } + + rec := bldr.NewRecord() + return array.NewTableFromRecords(schema, []arrow.Record{rec}) +} + +func BenchmarkWriteTableCompressed(b *testing.B) { + mem := memory.DefaultAllocator + table := buildTableForTest(mem) + defer table.Release() + + var uncompressedSize uint64 + for idxCol := 0; int64(idxCol) < table.NumCols(); idxCol++ { + column := table.Column(idxCol) + for _, chunk := range column.Data().Chunks() { + uncompressedSize += chunk.Data().SizeInBytes() + } + } + + var buf bytes.Buffer + buf.Grow(int(uncompressedSize)) + for _, tc := range compressTestCases { + b.Run(fmt.Sprintf("codec=%s", tc.c), func(b *testing.B) { + buf.Reset() + b.ResetTimer() + b.SetBytes(int64(uncompressedSize)) + for n := 0; n < b.N; n++ { + require.NoError(b, + pqarrow.WriteTable( + table, + &buf, + math.MaxInt64, + parquet.NewWriterProperties(parquet.WithAllocator(mem), parquet.WithCompression(tc.c)), + pqarrow.DefaultWriterProps(), + ), + ) + } + }) + } +} + +func BenchmarkReadTableCompressed(b *testing.B) { + ctx := context.Background() + mem := memory.DefaultAllocator + table := 
buildTableForTest(mem) + defer table.Release() + + for _, tc := range compressTestCases { + b.Run(fmt.Sprintf("codec=%s", tc.c), func(b *testing.B) { + var buf bytes.Buffer + err := pqarrow.WriteTable( + table, + &buf, + math.MaxInt64, + parquet.NewWriterProperties(parquet.WithAllocator(mem), parquet.WithCompression(tc.c)), + pqarrow.DefaultWriterProps(), + ) + require.NoError(b, err) + + compressedBytes := buf.Len() + rdr := bytes.NewReader(buf.Bytes()) + + b.ResetTimer() + b.SetBytes(int64(compressedBytes)) + for n := 0; n < b.N; n++ { + tab, err := pqarrow.ReadTable(ctx, rdr, nil, pqarrow.ArrowReadProperties{}, mem) + require.NoError(b, err) + defer tab.Release() + } + }) + } +} From ce1e724d7ea292746ede6a538519658f1ecab849 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 27 Aug 2024 19:17:55 +0200 Subject: [PATCH 082/157] MINOR: [CI] Use `docker compose` on self-hosted ARM builds (#43844) ### Rationale for this change The Docker client version on the ARM64 self-hosted runners is now recent enough, so we don't need to use `docker-compose` there anymore. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/cpp.yml | 5 +---- .github/workflows/go.yml | 5 ----- dev/tasks/java-jars/github.yml | 2 -- dev/tasks/linux-packages/github.linux.yml | 1 - dev/tasks/python-wheels/github.linux.yml | 1 - 5 files changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index a82e1eb7666..c5482f73082 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -99,7 +99,6 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -124,9 +123,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use `docker compose` because docker-compose v1 is obsolete, - # except where the Docker client version is too old. - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} @@ -147,6 +143,7 @@ jobs: run: | sudo apt update sudo apt install -y --no-install-recommends python3 python3-dev python3-pip + python3 -m pip install -U pip - name: Setup Archery run: python3 -m pip install -e dev/archery[docker] - name: Execute Docker Build diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 20c78d86cb2..ffd543691d5 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -78,14 +78,12 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } @@ -106,9 +104,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use Docker CLI because docker-compose v1 is obsolete, - # except where the Docker client version is too old. 
- ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} GO: ${{ matrix.go }} steps: - name: Checkout Arrow diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 7cbd5f05dab..bdbed1bd678 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -30,7 +30,6 @@ jobs: ARCH: {{ '${{ matrix.platform.archery_arch }}' }} ARCH_ALIAS: {{ '${{ matrix.platform.archery_arch_alias }}' }} ARCH_SHORT: {{ '${{ matrix.platform.archery_arch_short }}' }} - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: {{ "${{matrix.platform.archery_use_legacy_docker_compose || '0'}}" }} strategy: fail-fast: false matrix: @@ -45,7 +44,6 @@ jobs: archery_arch: "arm64v8" archery_arch_alias: "aarch64" archery_arch_short: "arm64" - archery_use_legacy_docker_compose: "1" steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_free_space()|indent }} diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index 4bf2295ef3e..cce976cd60e 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -29,7 +29,6 @@ jobs: {% endif %} env: ARCHITECTURE: {{ architecture }} - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: {{ '1' if architecture == 'arm64' else '0' }} steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_login_dockerhub()|indent }} diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index 2854d4349fb..97746ba3f9b 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -33,7 +33,6 @@ jobs: ARCH: amd64 {% else %} ARCH: arm64v8 - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 {% endif %} PYTHON: "{{ python_version }}" {% if python_version == "3.13" %} From 75ca5b3631144f58ea3edbe6b4933a686c0e0fd9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 28 Aug 2024 05:47:43 +0900 Subject: [PATCH 083/157] GH-43805: [C++] Enable filesystem automatically when one of ARROW_{AZURE,GCS,HDFS,S3}=ON is specified (#43806) ### Rationale for this change `ARROW_{AZURE,GCS,HDFS,S3}=ON` are meaningful only when filesystem is enabled. If the user specified one of them, we can assume that the user wants to enable filesystem. ### What changes are included in this PR? Enable `ARROW_FILESYSTEM` when one of `ARROW_{AZURE,GCS,HDFS,S3}=ON` are specified. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. `ARROW_FILESYSTEM` is enabled automatically with one of `ARROW_{AZURE,GCS,HDFS,S3}=ON`. 
* GitHub Issue: #43805 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/DefineOptions.cmake | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 41466a1c224..755887314d1 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -303,7 +303,10 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_IPC) define_option(ARROW_AZURE - "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF) + "Build Arrow with Azure support (requires the Azure SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) @@ -346,9 +349,16 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_WITH_UTF8PROC) define_option(ARROW_GCS - "Build Arrow with GCS support (requires the GCloud SDK for C++)" OFF) + "Build Arrow with GCS support (requires the GCloud SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) - define_option(ARROW_HDFS "Build the Arrow HDFS bridge" OFF) + define_option(ARROW_HDFS + "Build the Arrow HDFS bridge" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -398,7 +408,11 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_HDFS ARROW_JSON) - define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_S3 + "Build Arrow with S3 support (requires the AWS SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_SKYHOOK "Build the Skyhook libraries" From 09bb24a5cdf5b6e73334e9a8b521f0188d940c73 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 28 Aug 2024 06:13:31 +0530 Subject: [PATCH 084/157] MINOR: [Java] Logback dependency upgrade (#43842) ### Rationale for this change Fusing https://github.com/apache/arrow/pull/43752 and https://github.com/apache/arrow/pull/43827 dependabot PRs into a single PR. ### What changes are included in this PR? Keeping a single version for both `logback-classic` and `logback-core`. ### Are these changes tested? N/A ### Are there any user-facing changes? No Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/memory/memory-netty/pom.xml | 1 - java/pom.xml | 13 ++++++++++++- java/tools/pom.xml | 1 - 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/java/memory/memory-netty/pom.xml b/java/memory/memory-netty/pom.xml index f2d4d2d0fe3..6cf573dd4d3 100644 --- a/java/memory/memory-netty/pom.xml +++ b/java/memory/memory-netty/pom.xml @@ -56,7 +56,6 @@ under the License. ch.qos.logback logback-core - 1.3.14 test diff --git a/java/pom.xml b/java/pom.xml index f78d02c0c65..577f23e6a71 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -111,6 +111,7 @@ under the License. 5.11.0 5.2.0 3.46.0 + 1.5.7 none -Xdoclint:none @@ -221,6 +222,16 @@ under the License. pom import + + ch.qos.logback + logback-classic + ${logback.version} + + + ch.qos.logback + logback-core + ${logback.version} + @@ -274,7 +285,7 @@ under the License. ch.qos.logback logback-classic - 1.4.14 + ${logback.version} test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 94566495dff..082f06860c6 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -59,7 +59,6 @@ under the License. ch.qos.logback logback-classic - 1.4.14 test