diff --git a/connector/connect/src/main/protobuf/spark/connect/relations.proto b/connector/connect/src/main/protobuf/spark/connect/relations.proto index 52ff780d093a..aef4e4e7c642 100644 --- a/connector/connect/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/src/main/protobuf/spark/connect/relations.proto @@ -50,6 +50,7 @@ message Relation { RenameColumnsBySameLengthNames rename_columns_by_same_length_names = 18; RenameColumnsByNameToNameMap rename_columns_by_name_to_name_map = 19; ShowString show_string = 20; + Drop drop = 21; // NA functions NAFill fill_na = 90; @@ -252,6 +253,19 @@ message Sort { } } + +// Drop specified columns. +message Drop { + // (Required) The input relation. + Relation input = 1; + + // (Required) columns to drop. + // + // Should contain at least 1 item. + repeated Expression cols = 2; +} + + // Relation of type [[Deduplicate]] which have duplicate rows removed, could consider either only // the subset of columns or all the columns. message Deduplicate { diff --git a/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala index caff3d8f0713..1827aa4e3c00 100644 --- a/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala +++ b/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala @@ -510,6 +510,28 @@ package object dsl { .build() } + def drop(columns: String*): Relation = { + assert(columns.nonEmpty) + + val cols = columns.map(col => + Expression.newBuilder + .setUnresolvedAttribute( + Expression.UnresolvedAttribute.newBuilder + .setUnparsedIdentifier(col) + .build()) + .build()) + + Relation + .newBuilder() + .setDrop( + Drop + .newBuilder() + .setInput(logicalPlan) + .addAllCols(cols.asJava) + .build()) + .build() + } + def groupBy(groupingExprs: Expression*)(aggregateExprs: Expression*): Relation = { val agg = Aggregate.newBuilder() agg.setInput(logicalPlan) diff --git a/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 232f6e10474d..96d0dbe35803 100644 --- a/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -25,7 +25,7 @@ import com.google.common.collect.{Lists, Maps} import org.apache.spark.api.python.{PythonEvalType, SimplePythonFunction} import org.apache.spark.connect.proto import org.apache.spark.connect.proto.WriteOperation -import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.{Column, Dataset, SparkSession} import org.apache.spark.sql.catalyst.AliasIdentifier import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, MultiAlias, UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.expressions @@ -69,6 +69,7 @@ class SparkConnectPlanner(session: SparkSession) { case proto.Relation.RelTypeCase.DEDUPLICATE => transformDeduplicate(rel.getDeduplicate) case proto.Relation.RelTypeCase.SET_OP => transformSetOperation(rel.getSetOp) case proto.Relation.RelTypeCase.SORT => transformSort(rel.getSort) + case proto.Relation.RelTypeCase.DROP => transformDrop(rel.getDrop) case proto.Relation.RelTypeCase.AGGREGATE => transformAggregate(rel.getAggregate) case 
proto.Relation.RelTypeCase.SQL => transformSql(rel.getSql) case proto.Relation.RelTypeCase.LOCAL_RELATION => @@ -523,6 +524,19 @@ class SparkConnectPlanner(session: SparkSession) { sameOrderExpressions = Seq.empty) } + private def transformDrop(rel: proto.Drop): LogicalPlan = { + assert(rel.getColsCount > 0, "cols must contain at least 1 item!") + + val cols = rel.getColsList.asScala.toArray.map { expr => + Column(transformExpression(expr)) + } + + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .drop(cols.head, cols.tail: _*) + .logicalPlan + } + private def transformAggregate(rel: proto.Aggregate): LogicalPlan = { assert(rel.hasInput) diff --git a/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala b/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala index 404581445d0c..2a68a5627f15 100644 --- a/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala +++ b/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala @@ -148,6 +148,23 @@ class SparkConnectProtoSuite extends PlanTest with SparkConnectPlanTest { comparePlans(connectPlan2, sparkPlan2) } + test("SPARK-41169: Test drop") { + // single column + val connectPlan = connectTestRelation.drop("id") + val sparkPlan = sparkTestRelation.drop("id") + comparePlans(connectPlan, sparkPlan) + + // all columns + val connectPlan2 = connectTestRelation.drop("id", "name") + val sparkPlan2 = sparkTestRelation.drop("id", "name") + comparePlans(connectPlan2, sparkPlan2) + + // non-existing column + val connectPlan3 = connectTestRelation.drop("id2", "name") + val sparkPlan3 = sparkTestRelation.drop("id2", "name") + comparePlans(connectPlan3, sparkPlan3) + } + test("SPARK-40809: column alias") { // Simple Test.
val connectPlan = connectTestRelation.select("id".protoAttr.as("id2")) diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index 15aa028b11b1..50b1670bfbcd 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -257,10 +257,21 @@ def distinct(self) -> "DataFrame": ) def drop(self, *cols: "ColumnOrString") -> "DataFrame": - all_cols = self.columns - dropped = set([c.name() if isinstance(c, Column) else self[c].name() for c in cols]) - dropped_cols = filter(lambda x: x in dropped, all_cols) - return DataFrame.withPlan(plan.Project(self._plan, *dropped_cols), session=self._session) + _cols = list(cols) + if any(not isinstance(c, (str, Column)) for c in _cols): + raise TypeError( + f"'cols' must contain strings or Columns, but got {type(cols).__name__}" + ) + if len(_cols) == 0: + raise ValueError("'cols' must be non-empty") + + return DataFrame.withPlan( + plan.Drop( + child=self._plan, + columns=_cols, + ), + session=self._session, + ) def filter(self, condition: Expression) -> "DataFrame": return DataFrame.withPlan( diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index 63f1b8fa5410..ffb0ce080b30 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -464,6 +464,49 @@ def _repr_html_(self) -> str: """ +class Drop(LogicalPlan): + def __init__( + self, + child: Optional["LogicalPlan"], + columns: List[Union[Column, str]], + ) -> None: + super().__init__(child) + assert len(columns) > 0 and all(isinstance(c, (Column, str)) for c in columns) + self.columns = columns + + def _convert_to_expr( + self, col: Union[Column, str], session: "RemoteSparkSession" + ) -> proto.Expression: + expr = proto.Expression() + if isinstance(col, Column): + expr.CopyFrom(col.to_plan(session)) + else: + expr.CopyFrom(self.unresolved_attr(col)) + return expr + + def plan(self, session: "RemoteSparkSession") -> proto.Relation: + assert self._child is not None + plan = proto.Relation() + plan.drop.input.CopyFrom(self._child.plan(session)) + plan.drop.cols.extend([self._convert_to_expr(c, session) for c in self.columns]) + return plan + + def print(self, indent: int = 0) -> str: + c_buf = self._child.print(indent + LogicalPlan.INDENT) if self._child else "" + return f"{' ' * indent}<Drop columns={self.columns}>\n{c_buf}" + + def _repr_html_(self) -> str: + return f""" + <ul> + <li> + <b>Drop</b><br /> + Columns: {self.columns} + </li> + </ul> + """ + + class Sample(LogicalPlan): def __init__( self, diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index c586e4bdedaf..344caa3ea37e 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -33,7 +33,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xa6\x0b\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08
\x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b \x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c \x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 \x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12|\n#rename_columns_by_same_length_names\x18\x12 \x01(\x0b\x32-.spark.connect.RenameColumnsBySameLengthNamesH\x00R\x1erenameColumnsBySameLengthNames\x12w\n"rename_columns_by_name_to_name_map\x18\x13 \x01(\x0b\x32+.spark.connect.RenameColumnsByNameToNameMapH\x00R\x1crenameColumnsByNameToNameMap\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\t\n\x07Unknown"1\n\x0eRelationCommon\x12\x1f\n\x0bsource_info\x18\x01 \x01(\tR\nsourceInfo"\x1b\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query"\xaa\x03\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 \x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x1a=\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x1a\xcf\x01\n\nDataSource\x12\x16\n\x06\x66ormat\x18\x01 \x01(\tR\x06\x66ormat\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x00R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\xc2\x03\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 
\x03(\tR\x0cusingColumns"\xbb\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06"\x8c\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_name"L\n\x05Limit\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"\xd2\x01\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12H\n\x12result_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x11resultExpressions"\xa6\x04\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12>\n\x0bsort_fields\x18\x02 \x03(\x0b\x32\x1d.spark.connect.Sort.SortFieldR\nsortFields\x12 \n\tis_global\x18\x03 \x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x1a\xbc\x01\n\tSortField\x12\x39\n\nexpression\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nexpression\x12?\n\tdirection\x18\x02 \x01(\x0e\x32!.spark.connect.Sort.SortDirectionR\tdirection\x12\x33\n\x05nulls\x18\x03 \x01(\x0e\x32\x1d.spark.connect.Sort.SortNullsR\x05nulls"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"R\n\tSortNulls\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x42\x0c\n\n_is_global"\xab\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 \x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x42\x16\n\x14_all_columns_as_keys"]\n\rLocalRelation\x12L\n\nattributes\x18\x01 \x03(\x0b\x32,.spark.connect.Expression.QualifiedAttributeR\nattributes"\xe0\x01\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x42\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 
\x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8d\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x18\n\x07numRows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 \x01(\x08R\x08vertical"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 \x03(\tR\nstatistics"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"r\n\x1eRenameColumnsBySameLengthNames\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\x83\x02\n\x1cRenameColumnsByNameToNameMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12o\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x41.spark.connect.RenameColumnsByNameToNameMap.RenameColumnsMapEntryR\x10renameColumnsMap\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' + b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xd1\x0b\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08 \x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b \x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c \x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 
\x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12|\n#rename_columns_by_same_length_names\x18\x12 \x01(\x0b\x32-.spark.connect.RenameColumnsBySameLengthNamesH\x00R\x1erenameColumnsBySameLengthNames\x12w\n"rename_columns_by_name_to_name_map\x18\x13 \x01(\x0b\x32+.spark.connect.RenameColumnsByNameToNameMapH\x00R\x1crenameColumnsByNameToNameMap\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12)\n\x04\x64rop\x18\x15 \x01(\x0b\x32\x13.spark.connect.DropH\x00R\x04\x64rop\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\t\n\x07Unknown"1\n\x0eRelationCommon\x12\x1f\n\x0bsource_info\x18\x01 \x01(\tR\nsourceInfo"\x1b\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query"\xaa\x03\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 \x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x1a=\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x1a\xcf\x01\n\nDataSource\x12\x16\n\x06\x66ormat\x18\x01 \x01(\tR\x06\x66ormat\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x00R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\xc2\x03\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 \x03(\tR\x0cusingColumns"\xbb\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06"\x8c\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_name"L\n\x05Limit\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"\xd2\x01\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12H\n\x12result_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x11resultExpressions"\xa6\x04\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12>\n\x0bsort_fields\x18\x02 \x03(\x0b\x32\x1d.spark.connect.Sort.SortFieldR\nsortFields\x12 \n\tis_global\x18\x03 \x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x1a\xbc\x01\n\tSortField\x12\x39\n\nexpression\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nexpression\x12?\n\tdirection\x18\x02 \x01(\x0e\x32!.spark.connect.Sort.SortDirectionR\tdirection\x12\x33\n\x05nulls\x18\x03 \x01(\x0e\x32\x1d.spark.connect.Sort.SortNullsR\x05nulls"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"R\n\tSortNulls\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x42\x0c\n\n_is_global"d\n\x04\x44rop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12-\n\x04\x63ols\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x04\x63ols"\xab\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 \x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x42\x16\n\x14_all_columns_as_keys"]\n\rLocalRelation\x12L\n\nattributes\x18\x01 \x03(\x0b\x32,.spark.connect.Expression.QualifiedAttributeR\nattributes"\xe0\x01\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x42\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 \x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8d\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x18\n\x07numRows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 \x01(\x08R\x08vertical"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 
\x03(\tR\nstatistics"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"r\n\x1eRenameColumnsBySameLengthNames\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\x83\x02\n\x1cRenameColumnsByNameToNameMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12o\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x41.spark.connect.RenameColumnsByNameToNameMap.RenameColumnsMapEntryR\x10renameColumnsMap\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' ) @@ -54,6 +54,7 @@ _AGGREGATE = DESCRIPTOR.message_types_by_name["Aggregate"] _SORT = DESCRIPTOR.message_types_by_name["Sort"] _SORT_SORTFIELD = _SORT.nested_types_by_name["SortField"] +_DROP = DESCRIPTOR.message_types_by_name["Drop"] _DEDUPLICATE = DESCRIPTOR.message_types_by_name["Deduplicate"] _LOCALRELATION = DESCRIPTOR.message_types_by_name["LocalRelation"] _SAMPLE = DESCRIPTOR.message_types_by_name["Sample"] @@ -256,6 +257,17 @@ _sym_db.RegisterMessage(Sort) _sym_db.RegisterMessage(Sort.SortField) +Drop = _reflection.GeneratedProtocolMessageType( + "Drop", + (_message.Message,), + { + "DESCRIPTOR": _DROP, + "__module__": "spark.connect.relations_pb2" + # @@protoc_insertion_point(class_scope:spark.connect.Drop) + }, +) +_sym_db.RegisterMessage(Drop) + Deduplicate = _reflection.GeneratedProtocolMessageType( "Deduplicate", (_message.Message,), @@ -407,71 +419,73 @@ _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._options = None _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_options = b"8\001" _RELATION._serialized_start = 82 - _RELATION._serialized_end = 1528 - _UNKNOWN._serialized_start = 1530 - _UNKNOWN._serialized_end = 1539 - _RELATIONCOMMON._serialized_start = 1541 - _RELATIONCOMMON._serialized_end = 1590 - _SQL._serialized_start = 1592 - _SQL._serialized_end = 1619 - _READ._serialized_start = 1622 - _READ._serialized_end = 2048 - _READ_NAMEDTABLE._serialized_start = 1764 - _READ_NAMEDTABLE._serialized_end = 1825 - _READ_DATASOURCE._serialized_start = 1828 - _READ_DATASOURCE._serialized_end = 2035 - _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 1966 - _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2024 - _PROJECT._serialized_start = 2050 - _PROJECT._serialized_end = 2167 - _FILTER._serialized_start = 2169 - _FILTER._serialized_end = 2281 - _JOIN._serialized_start = 2284 - _JOIN._serialized_end = 2734 - _JOIN_JOINTYPE._serialized_start = 2547 - _JOIN_JOINTYPE._serialized_end = 2734 - _SETOPERATION._serialized_start = 2737 - _SETOPERATION._serialized_end = 3133 - _SETOPERATION_SETOPTYPE._serialized_start = 2996 - _SETOPERATION_SETOPTYPE._serialized_end = 3110 - _LIMIT._serialized_start = 3135 - _LIMIT._serialized_end = 3211 - _OFFSET._serialized_start = 3213 - _OFFSET._serialized_end = 3292 - _AGGREGATE._serialized_start = 3295 - _AGGREGATE._serialized_end = 3505 - _SORT._serialized_start = 3508 - _SORT._serialized_end = 4058 - _SORT_SORTFIELD._serialized_start = 3662 - _SORT_SORTFIELD._serialized_end = 
3850 - _SORT_SORTDIRECTION._serialized_start = 3852 - _SORT_SORTDIRECTION._serialized_end = 3960 - _SORT_SORTNULLS._serialized_start = 3962 - _SORT_SORTNULLS._serialized_end = 4044 - _DEDUPLICATE._serialized_start = 4061 - _DEDUPLICATE._serialized_end = 4232 - _LOCALRELATION._serialized_start = 4234 - _LOCALRELATION._serialized_end = 4327 - _SAMPLE._serialized_start = 4330 - _SAMPLE._serialized_end = 4554 - _RANGE._serialized_start = 4557 - _RANGE._serialized_end = 4702 - _SUBQUERYALIAS._serialized_start = 4704 - _SUBQUERYALIAS._serialized_end = 4818 - _REPARTITION._serialized_start = 4821 - _REPARTITION._serialized_end = 4963 - _SHOWSTRING._serialized_start = 4966 - _SHOWSTRING._serialized_end = 5107 - _STATSUMMARY._serialized_start = 5109 - _STATSUMMARY._serialized_end = 5201 - _STATCROSSTAB._serialized_start = 5203 - _STATCROSSTAB._serialized_end = 5304 - _NAFILL._serialized_start = 5307 - _NAFILL._serialized_end = 5441 - _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 5443 - _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 5557 - _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 5560 - _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 5819 - _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 5752 - _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 5819 + _RELATION._serialized_end = 1571 + _UNKNOWN._serialized_start = 1573 + _UNKNOWN._serialized_end = 1582 + _RELATIONCOMMON._serialized_start = 1584 + _RELATIONCOMMON._serialized_end = 1633 + _SQL._serialized_start = 1635 + _SQL._serialized_end = 1662 + _READ._serialized_start = 1665 + _READ._serialized_end = 2091 + _READ_NAMEDTABLE._serialized_start = 1807 + _READ_NAMEDTABLE._serialized_end = 1868 + _READ_DATASOURCE._serialized_start = 1871 + _READ_DATASOURCE._serialized_end = 2078 + _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 2009 + _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2067 + _PROJECT._serialized_start = 2093 + _PROJECT._serialized_end = 2210 + _FILTER._serialized_start = 2212 + _FILTER._serialized_end = 2324 + _JOIN._serialized_start = 2327 + _JOIN._serialized_end = 2777 + _JOIN_JOINTYPE._serialized_start = 2590 + _JOIN_JOINTYPE._serialized_end = 2777 + _SETOPERATION._serialized_start = 2780 + _SETOPERATION._serialized_end = 3176 + _SETOPERATION_SETOPTYPE._serialized_start = 3039 + _SETOPERATION_SETOPTYPE._serialized_end = 3153 + _LIMIT._serialized_start = 3178 + _LIMIT._serialized_end = 3254 + _OFFSET._serialized_start = 3256 + _OFFSET._serialized_end = 3335 + _AGGREGATE._serialized_start = 3338 + _AGGREGATE._serialized_end = 3548 + _SORT._serialized_start = 3551 + _SORT._serialized_end = 4101 + _SORT_SORTFIELD._serialized_start = 3705 + _SORT_SORTFIELD._serialized_end = 3893 + _SORT_SORTDIRECTION._serialized_start = 3895 + _SORT_SORTDIRECTION._serialized_end = 4003 + _SORT_SORTNULLS._serialized_start = 4005 + _SORT_SORTNULLS._serialized_end = 4087 + _DROP._serialized_start = 4103 + _DROP._serialized_end = 4203 + _DEDUPLICATE._serialized_start = 4206 + _DEDUPLICATE._serialized_end = 4377 + _LOCALRELATION._serialized_start = 4379 + _LOCALRELATION._serialized_end = 4472 + _SAMPLE._serialized_start = 4475 + _SAMPLE._serialized_end = 4699 + _RANGE._serialized_start = 4702 + _RANGE._serialized_end = 4847 + _SUBQUERYALIAS._serialized_start = 4849 + _SUBQUERYALIAS._serialized_end = 4963 + _REPARTITION._serialized_start = 4966 + _REPARTITION._serialized_end = 5108 + _SHOWSTRING._serialized_start = 5111 + _SHOWSTRING._serialized_end = 5252 + _STATSUMMARY._serialized_start = 5254 + 
_STATSUMMARY._serialized_end = 5346 + _STATCROSSTAB._serialized_start = 5348 + _STATCROSSTAB._serialized_end = 5449 + _NAFILL._serialized_start = 5452 + _NAFILL._serialized_end = 5586 + _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 5588 + _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 5702 + _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 5705 + _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 5964 + _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 5897 + _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 5964 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 27c6db4e7481..30e61282baaf 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -79,6 +79,7 @@ class Relation(google.protobuf.message.Message): RENAME_COLUMNS_BY_SAME_LENGTH_NAMES_FIELD_NUMBER: builtins.int RENAME_COLUMNS_BY_NAME_TO_NAME_MAP_FIELD_NUMBER: builtins.int SHOW_STRING_FIELD_NUMBER: builtins.int + DROP_FIELD_NUMBER: builtins.int FILL_NA_FIELD_NUMBER: builtins.int SUMMARY_FIELD_NUMBER: builtins.int CROSSTAB_FIELD_NUMBER: builtins.int @@ -124,6 +125,8 @@ class Relation(google.protobuf.message.Message): @property def show_string(self) -> global___ShowString: ... @property + def drop(self) -> global___Drop: ... + @property def fill_na(self) -> global___NAFill: """NA functions""" @property @@ -156,6 +159,7 @@ class Relation(google.protobuf.message.Message): rename_columns_by_same_length_names: global___RenameColumnsBySameLengthNames | None = ..., rename_columns_by_name_to_name_map: global___RenameColumnsByNameToNameMap | None = ..., show_string: global___ShowString | None = ..., + drop: global___Drop | None = ..., fill_na: global___NAFill | None = ..., summary: global___StatSummary | None = ..., crosstab: global___StatCrosstab | None = ..., @@ -172,6 +176,8 @@ class Relation(google.protobuf.message.Message): b"crosstab", "deduplicate", b"deduplicate", + "drop", + b"drop", "fill_na", b"fill_na", "filter", @@ -227,6 +233,8 @@ class Relation(google.protobuf.message.Message): b"crosstab", "deduplicate", b"deduplicate", + "drop", + b"drop", "fill_na", b"fill_na", "filter", @@ -293,6 +301,7 @@ class Relation(google.protobuf.message.Message): "rename_columns_by_same_length_names", "rename_columns_by_name_to_name_map", "show_string", + "drop", "fill_na", "summary", "crosstab", @@ -961,6 +970,42 @@ class Sort(google.protobuf.message.Message): global___Sort = Sort +class Drop(google.protobuf.message.Message): + """Drop specified columns.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + INPUT_FIELD_NUMBER: builtins.int + COLS_FIELD_NUMBER: builtins.int + @property + def input(self) -> global___Relation: + """(Required) The input relation.""" + @property + def cols( + self, + ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[ + pyspark.sql.connect.proto.expressions_pb2.Expression + ]: + """(Required) columns to drop. + + Should contain at least 1 item. + """ + def __init__( + self, + *, + input: global___Relation | None = ..., + cols: collections.abc.Iterable[pyspark.sql.connect.proto.expressions_pb2.Expression] + | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["input", b"input"] + ) -> builtins.bool: ... + def ClearField( + self, field_name: typing_extensions.Literal["cols", b"cols", "input", b"input"] + ) -> None: ... 
+ +global___Drop = Drop + class Deduplicate(google.protobuf.message.Message): """Relation of type [[Deduplicate]] which have duplicate rows removed, could consider either only the subset of columns or all the columns. diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 9e7a5f2f4a54..61544d521406 100644 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -188,6 +188,33 @@ def test_take(self) -> None: df2 = self.connect.read.table(self.tbl_name_empty) self.assertEqual(0, len(df2.take(5))) + def test_drop(self): + # SPARK-41169: test drop + query = """ + SELECT * FROM VALUES + (false, 1, NULL), (false, NULL, 2), (NULL, 3, 3) + AS tab(a, b, c) + """ + + cdf = self.connect.sql(query) + sdf = self.spark.sql(query) + self.assert_eq( + cdf.drop("a").toPandas(), + sdf.drop("a").toPandas(), + ) + self.assert_eq( + cdf.drop("a", "b").toPandas(), + sdf.drop("a", "b").toPandas(), + ) + self.assert_eq( + cdf.drop("a", "x").toPandas(), + sdf.drop("a", "x").toPandas(), + ) + self.assert_eq( + cdf.drop(cdf.a, cdf.x).toPandas(), + sdf.drop("a", "x").toPandas(), + ) + def test_subquery_alias(self) -> None: # SPARK-40938: test subquery alias. plan_text = ( diff --git a/python/pyspark/sql/tests/connect/test_connect_plan_only.py b/python/pyspark/sql/tests/connect/test_connect_plan_only.py index d560c8b893fc..26a8da46ac3f 100644 --- a/python/pyspark/sql/tests/connect/test_connect_plan_only.py +++ b/python/pyspark/sql/tests/connect/test_connect_plan_only.py @@ -181,6 +181,22 @@ def test_sort(self): ) self.assertEqual(plan.root.sort.is_global, False) + def test_drop(self): + # SPARK-41169: test drop + df = self.connect.readTable(table_name=self.tbl_name) + + plan = df.filter(df.col_name > 3).drop("col_a", "col_b")._plan.to_proto(self.connect) + self.assertEqual( + [f.unresolved_attribute.unparsed_identifier for f in plan.root.drop.cols], + ["col_a", "col_b"], + ) + + plan = df.filter(df.col_name > 3).drop(df.col_x, "col_b")._plan.to_proto(self.connect) + self.assertEqual( + [f.unresolved_attribute.unparsed_identifier for f in plan.root.drop.cols], + ["col_x", "col_b"], + ) + def test_deduplicate(self): df = self.connect.readTable(table_name=self.tbl_name)
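
Usage sketch (illustrative, not part of the diff): the snippet below hand-builds the new Drop relation against the generated Python protos changed above, mirroring the assertions in test_connect_plan_only.py. The SQL input and the column name are made up for this example; plain column names travel as unresolved attributes, exactly as plan.Drop encodes them.

    import pyspark.sql.connect.proto.expressions_pb2 as expressions_pb2
    import pyspark.sql.connect.proto.relations_pb2 as relations_pb2

    # Any relation can feed Drop; a bare SQL relation keeps the example small.
    rel = relations_pb2.Relation()
    rel.drop.input.sql.query = "SELECT 1 AS a, 2 AS b"

    # Columns to drop are full Expressions; a plain name is carried as an
    # unresolved attribute holding the unparsed identifier.
    col = expressions_pb2.Expression()
    col.unresolved_attribute.unparsed_identifier = "b"
    rel.drop.cols.append(col)

    assert rel.HasField("drop")
    assert [c.unresolved_attribute.unparsed_identifier for c in rel.drop.cols] == ["b"]

On the server side, SparkConnectPlanner.transformDrop resolves this message through Dataset.drop, so a column that does not exist in the input is silently ignored, matching the classic Dataset semantics exercised by the non-existing-column cases in the tests above.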