130 changes: 65 additions & 65 deletions python/pyspark/sql/connect/proto/expressions_pb2.py

Large diffs are not rendered by default.

95 changes: 24 additions & 71 deletions python/pyspark/sql/connect/proto/expressions_pb2.pyi
@@ -496,7 +496,6 @@ class Expression(google.protobuf.message.Message):

ELEMENT_TYPE_FIELD_NUMBER: builtins.int
ELEMENTS_FIELD_NUMBER: builtins.int
DATA_TYPE_FIELD_NUMBER: builtins.int
@property
def element_type(self) -> pyspark.sql.connect.proto.types_pb2.DataType:
"""(Deprecated) The element type of the array.
@@ -509,42 +508,20 @@ class Expression(google.protobuf.message.Message):
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
global___Expression.Literal
]:
"""The literal values that make up the array elements.

For inferring the data_type.element_type, only the first element needs to
contain the type information.
"""
@property
def data_type(self) -> pyspark.sql.connect.proto.types_pb2.DataType.Array:
"""The type of the array. You don't need to set this field if the type information is not needed.

If the element type can be inferred from the first element of the elements field,
then you don't need to set data_type.element_type to save space.

On the other hand, redundant type information is also acceptable.
"""
"""The literal values that make up the array elements."""
def __init__(
self,
*,
element_type: pyspark.sql.connect.proto.types_pb2.DataType | None = ...,
elements: collections.abc.Iterable[global___Expression.Literal] | None = ...,
data_type: pyspark.sql.connect.proto.types_pb2.DataType.Array | None = ...,
) -> None: ...
def HasField(
self,
field_name: typing_extensions.Literal[
"data_type", b"data_type", "element_type", b"element_type"
],
self, field_name: typing_extensions.Literal["element_type", b"element_type"]
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
"data_type",
b"data_type",
"element_type",
b"element_type",
"elements",
b"elements",
"element_type", b"element_type", "elements", b"elements"
],
) -> None: ...

@@ -555,7 +532,6 @@ class Expression(google.protobuf.message.Message):
VALUE_TYPE_FIELD_NUMBER: builtins.int
KEYS_FIELD_NUMBER: builtins.int
VALUES_FIELD_NUMBER: builtins.int
DATA_TYPE_FIELD_NUMBER: builtins.int
@property
def key_type(self) -> pyspark.sql.connect.proto.types_pb2.DataType:
"""(Deprecated) The key type of the map.
@@ -575,51 +551,31 @@ class Expression(google.protobuf.message.Message):
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
global___Expression.Literal
]:
"""The literal keys that make up the map.

For inferring the data_type.key_type, only the first key needs to
contain the type information.
"""
"""The literal keys that make up the map."""
@property
def values(
self,
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
global___Expression.Literal
]:
"""The literal values that make up the map.

For inferring the data_type.value_type, only the first value needs to
contain the type information.
"""
@property
def data_type(self) -> pyspark.sql.connect.proto.types_pb2.DataType.Map:
"""The type of the map. You don't need to set this field if the type information is not needed.

If the key/value types can be inferred from the first element of the keys/values fields,
then you don't need to set data_type.key_type/data_type.value_type to save space.

On the other hand, redundant type information is also acceptable.
"""
"""The literal values that make up the map."""
def __init__(
self,
*,
key_type: pyspark.sql.connect.proto.types_pb2.DataType | None = ...,
value_type: pyspark.sql.connect.proto.types_pb2.DataType | None = ...,
keys: collections.abc.Iterable[global___Expression.Literal] | None = ...,
values: collections.abc.Iterable[global___Expression.Literal] | None = ...,
data_type: pyspark.sql.connect.proto.types_pb2.DataType.Map | None = ...,
) -> None: ...
def HasField(
self,
field_name: typing_extensions.Literal[
"data_type", b"data_type", "key_type", b"key_type", "value_type", b"value_type"
"key_type", b"key_type", "value_type", b"value_type"
],
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
"data_type",
b"data_type",
"key_type",
b"key_type",
"keys",
@@ -636,50 +592,33 @@ class Expression(google.protobuf.message.Message):

STRUCT_TYPE_FIELD_NUMBER: builtins.int
ELEMENTS_FIELD_NUMBER: builtins.int
DATA_TYPE_STRUCT_FIELD_NUMBER: builtins.int
@property
def struct_type(self) -> pyspark.sql.connect.proto.types_pb2.DataType:
"""(Deprecated) The type of the struct.

This field is deprecated since Spark 4.1+ because using DataType as the type of a struct
is ambiguous. Use data_type_struct field instead.
is ambiguous. Use data_type field instead.
"""
@property
def elements(
self,
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
global___Expression.Literal
]:
"""(Required) The literal values that make up the struct elements."""
@property
def data_type_struct(self) -> pyspark.sql.connect.proto.types_pb2.DataType.Struct:
"""The type of the struct. You don't need to set this field if the type information is not needed.

Whether data_type_struct.fields.data_type should be set depends on
whether each field's type can be inferred from the elements field.
"""
"""The literal values that make up the struct elements."""
def __init__(
self,
*,
struct_type: pyspark.sql.connect.proto.types_pb2.DataType | None = ...,
elements: collections.abc.Iterable[global___Expression.Literal] | None = ...,
data_type_struct: pyspark.sql.connect.proto.types_pb2.DataType.Struct | None = ...,
) -> None: ...
def HasField(
self,
field_name: typing_extensions.Literal[
"data_type_struct", b"data_type_struct", "struct_type", b"struct_type"
],
self, field_name: typing_extensions.Literal["struct_type", b"struct_type"]
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
"data_type_struct",
b"data_type_struct",
"elements",
b"elements",
"struct_type",
b"struct_type",
"elements", b"elements", "struct_type", b"struct_type"
],
) -> None: ...

@@ -811,6 +750,7 @@ class Expression(google.protobuf.message.Message):
STRUCT_FIELD_NUMBER: builtins.int
SPECIALIZED_ARRAY_FIELD_NUMBER: builtins.int
TIME_FIELD_NUMBER: builtins.int
DATA_TYPE_FIELD_NUMBER: builtins.int
@property
def null(self) -> pyspark.sql.connect.proto.types_pb2.DataType: ...
binary: builtins.bytes
@@ -844,6 +784,14 @@ class Expression(google.protobuf.message.Message):
def specialized_array(self) -> global___Expression.Literal.SpecializedArray: ...
@property
def time(self) -> global___Expression.Literal.Time: ...
@property
def data_type(self) -> pyspark.sql.connect.proto.types_pb2.DataType:
"""Data type information for the literal.
This field is required only in the root literal message for null values or
for data types (e.g., array, map, or struct) with non-trivial information.
If the data_type field is not set at the root level, the data type will be
inferred or retrieved from the deprecated data type fields on a best-effort basis.
"""
def __init__(
self,
*,
@@ -869,6 +817,7 @@ class Expression(google.protobuf.message.Message):
struct: global___Expression.Literal.Struct | None = ...,
specialized_array: global___Expression.Literal.SpecializedArray | None = ...,
time: global___Expression.Literal.Time | None = ...,
data_type: pyspark.sql.connect.proto.types_pb2.DataType | None = ...,
) -> None: ...
def HasField(
self,
@@ -883,6 +832,8 @@ class Expression(google.protobuf.message.Message):
b"byte",
"calendar_interval",
b"calendar_interval",
"data_type",
b"data_type",
"date",
b"date",
"day_time_interval",
@@ -934,6 +885,8 @@ class Expression(google.protobuf.message.Message):
b"byte",
"calendar_interval",
b"calendar_interval",
"data_type",
b"data_type",
"date",
b"date",
"day_time_interval",
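To make the new shape concrete, here is a hedged Python sketch against the regenerated stubs above (field spellings are taken from the .pyi; treat this as an illustration, not the canonical client code). It builds an array literal and attaches the element type once, via the root-level data_type, instead of the removed Array.data_type:

from pyspark.sql.connect.proto import expressions_pb2 as expr_pb2
from pyspark.sql.connect.proto import types_pb2

# The element type is stated once at the root; the removed Array.data_type
# and the deprecated Array.element_type both stay unset.
int_type = types_pb2.DataType(integer=types_pb2.DataType.Integer())
array_lit = expr_pb2.Expression.Literal(
    array=expr_pb2.Expression.Literal.Array(
        elements=[
            expr_pb2.Expression.Literal(integer=1),
            expr_pb2.Expression.Literal(integer=2),
        ],
    ),
    data_type=types_pb2.DataType(
        array=types_pb2.DataType.Array(element_type=int_type, contains_null=False)
    ),
)
assert array_lit.HasField("data_type")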
@@ -78,23 +78,17 @@ class ColumnNodeToProtoConverterSuite extends ConnectFunSuite {
testConversion(
Literal((12.0, "north", 60.0, "west"), Option(dataType)),
expr { b =>
val builder = b.getLiteralBuilder.getStructBuilder
builder
b.getLiteralBuilder.getStructBuilder
.addElements(proto.Expression.Literal.newBuilder().setDouble(12.0).build())
builder
.addElements(proto.Expression.Literal.newBuilder().setString("north").build())
builder
.addElements(proto.Expression.Literal.newBuilder().setDouble(60.0).build())
builder
.addElements(proto.Expression.Literal.newBuilder().setString("west").build())
builder.setDataTypeStruct(
b.getLiteralBuilder.getDataTypeBuilder.setStruct(
proto.DataType.Struct
.newBuilder()
.addFields(
proto.DataType.StructField.newBuilder().setName("_1").setNullable(true).build())
.addFields(structField("_1", ProtoDataTypes.DoubleType))
.addFields(structField("_2", stringTypeWithCollation))
.addFields(
proto.DataType.StructField.newBuilder().setName("_3").setNullable(true).build())
.addFields(structField("_3", ProtoDataTypes.DoubleType))
.addFields(structField("_4", stringTypeWithCollation))
.build())
})
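For comparison, a hedged Python analogue of the updated Scala test above (the plain string type is an assumption; the Scala suite uses stringTypeWithCollation): the struct's field types ride on the root-level data_type rather than the removed Struct.data_type_struct.

from pyspark.sql.connect.proto import expressions_pb2 as expr_pb2
from pyspark.sql.connect.proto import types_pb2

def struct_field(name, data_type):
    # Mirrors the structField helper in the Scala suite (an assumption).
    return types_pb2.DataType.StructField(name=name, data_type=data_type, nullable=True)

double_type = types_pb2.DataType(double=types_pb2.DataType.Double())
string_type = types_pb2.DataType(string=types_pb2.DataType.String())

struct_lit = expr_pb2.Expression.Literal(
    struct=expr_pb2.Expression.Literal.Struct(
        elements=[
            expr_pb2.Expression.Literal(double=12.0),
            expr_pb2.Expression.Literal(string="north"),
            expr_pb2.Expression.Literal(double=60.0),
            expr_pb2.Expression.Literal(string="west"),
        ],
    ),
    data_type=types_pb2.DataType(
        struct=types_pb2.DataType.Struct(
            fields=[
                struct_field("_1", double_type),
                struct_field("_2", string_type),
                struct_field("_3", double_type),
                struct_field("_4", string_type),
            ]
        )
    ),
)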
@@ -207,6 +207,13 @@ message Expression {
Time time = 26;
}

// Data type information for the literal.
// This field is required only in the root literal message for null values or
// for data types (e.g., array, map, or struct) with non-trivial information.
// If the data_type field is not set at the root level, the data type will be
// inferred or retrieved from the deprecated data type fields on a best-effort basis.
DataType data_type = 100;
Contributor:

Is this an optional field? For simple literals, such as byte and int, we can actually infer the data type.

@heyihong (Contributor Author, Sep 17, 2025):

In proto3, fields with message types are always optional. If the data_type field is not set, we fall back to inferring the data type or reading the deprecated data type fields. I will update the comment.

In my opinion, it is better to always set the data_type field, because avoiding data type inference simplifies both the protocol and its implementation across different languages.
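A small hedged illustration of this exchange (module paths assumed from the stubs in this PR): for a plain integer the oneof arm already fixes the type, so data_type can be omitted; a typed null has nothing to infer from, so the root-level data_type carries it.

from pyspark.sql.connect.proto import expressions_pb2 as expr_pb2
from pyspark.sql.connect.proto import types_pb2

# Simple literal: the integer arm implies the type; data_type may be omitted.
int_lit = expr_pb2.Expression.Literal(integer=42)
assert not int_lit.HasField("data_type")

# Null literal: set data_type at the root. Whether the deprecated null arm
# should still duplicate the type is an assumption here; the old comments
# allowed redundant type information.
string_type = types_pb2.DataType(string=types_pb2.DataType.String())
null_lit = expr_pb2.Expression.Literal(null=string_type, data_type=string_type)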


message Decimal {
// the string representation.
string value = 1;
@@ -230,18 +237,7 @@ message Expression {
DataType element_type = 1 [deprecated = true];

// The literal values that make up the array elements.
//
// For inferring the data_type.element_type, only the first element needs to
// contain the type information.
repeated Literal elements = 2;

// The type of the array. You don't need to set this field if the type information is not needed.
//
// If the element type can be inferred from the first element of the elements field,
// then you don't need to set data_type.element_type to save space.
//
// On the other hand, redundant type information is also acceptable.
DataType.Array data_type = 3;
Contributor:

Is this compatible with the old client versions?

@heyihong (Contributor Author, Sep 17, 2025):

Yes, old client versions don't use these removed fields. Also, PySpark Connect still uses the old client version, and it works.
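A hedged sketch of why the removal is wire-safe (standard proto3 behavior, not code from this PR): parsers skip unknown fields, so bytes from an old client that still populated the removed Array.data_type tag parse cleanly under the new schema.

from pyspark.sql.connect.proto import expressions_pb2 as expr_pb2

payload = expr_pb2.Expression.Literal.Array(
    elements=[expr_pb2.Expression.Literal(integer=1)]
).SerializeToString()
# Simulate an old client: append the removed field number 3 (wire type 2,
# length-delimited) carrying an empty DataType.Array message.
old_payload = payload + b"\x1a\x00"
parsed = expr_pb2.Expression.Literal.Array.FromString(old_payload)  # no error
assert len(parsed.elements) == 1  # the unknown field is simply skipped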

}

message Map {
@@ -257,41 +253,21 @@ message Expression {
DataType value_type = 2 [deprecated = true];

// The literal keys that make up the map.
//
// For inferring the data_type.key_type, only the first key needs to
// contain the type information.
repeated Literal keys = 3;

// The literal values that make up the map.
//
// For inferring the data_type.value_type, only the first value needs to
// contain the type information.
repeated Literal values = 4;

// The type of the map. You don't need to set this field if the type information is not needed.
//
// If the key/value types can be inferred from the first element of the keys/values fields,
// then you don't need to set data_type.key_type/data_type.value_type to save space.
//
// On the other hand, redundant type information is also acceptable.
DataType.Map data_type = 5;
}

message Struct {
// (Deprecated) The type of the struct.
//
// This field is deprecated since Spark 4.1+ because using DataType as the type of a struct
// is ambiguous. Use data_type_struct field instead.
// is ambiguous. Use data_type field instead.
DataType struct_type = 1 [deprecated = true];

// (Required) The literal values that make up the struct elements.
// The literal values that make up the struct elements.
repeated Literal elements = 2;

// The type of the struct. You don't need to set this field if the type information is not needed.
//
// Whether data_type_struct.fields.data_type should be set depends on
// whether each field's type can be inferred from the elements field.
DataType.Struct data_type_struct = 3;
}

message SpecializedArray {