Spark 4.0: Refactor Spark procedures to consistently use ProcedureInput for parameter handling. #13913
Changes from 8 commits
ExpireSnapshotsProcedure.java

```diff
@@ -26,7 +26,6 @@
 import org.apache.iceberg.spark.actions.ExpireSnapshotsSparkAction;
 import org.apache.iceberg.spark.actions.SparkActions;
 import org.apache.iceberg.spark.procedures.SparkProcedures.ProcedureBuilder;
-import org.apache.iceberg.util.DateTimeUtil;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.connector.catalog.Identifier;
 import org.apache.spark.sql.connector.catalog.TableCatalog;
```
```diff
@@ -51,15 +50,30 @@ public class ExpireSnapshotsProcedure extends BaseProcedure {
 
   private static final Logger LOG = LoggerFactory.getLogger(ExpireSnapshotsProcedure.class);
 
+  private static final ProcedureParameter TABLE_PARAM =
+      requiredInParameter("table", DataTypes.StringType);
+  private static final ProcedureParameter OLDER_THAN_PARAM =
+      optionalInParameter("older_than", DataTypes.TimestampType);
+  private static final ProcedureParameter RETAIN_LAST_PARAM =
+      optionalInParameter("retain_last", DataTypes.IntegerType);
+  private static final ProcedureParameter MAX_CONCURRENT_DELETES_PARAM =
+      optionalInParameter("max_concurrent_deletes", DataTypes.IntegerType);
+  private static final ProcedureParameter STREAM_RESULTS_PARAM =
+      optionalInParameter("stream_results", DataTypes.BooleanType);
+  private static final ProcedureParameter SNAPSHOT_IDS_PARAM =
+      optionalInParameter("snapshot_ids", DataTypes.createArrayType(DataTypes.LongType));
+  private static final ProcedureParameter CLEAN_EXPIRED_METADATA_PARAM =
+      optionalInParameter("clean_expired_metadata", DataTypes.BooleanType);
+
   private static final ProcedureParameter[] PARAMETERS =
       new ProcedureParameter[] {
-        requiredInParameter("table", DataTypes.StringType),
-        optionalInParameter("older_than", DataTypes.TimestampType),
-        optionalInParameter("retain_last", DataTypes.IntegerType),
-        optionalInParameter("max_concurrent_deletes", DataTypes.IntegerType),
-        optionalInParameter("stream_results", DataTypes.BooleanType),
-        optionalInParameter("snapshot_ids", DataTypes.createArrayType(DataTypes.LongType)),
-        optionalInParameter("clean_expired_metadata", DataTypes.BooleanType)
+        TABLE_PARAM,
+        OLDER_THAN_PARAM,
+        RETAIN_LAST_PARAM,
+        MAX_CONCURRENT_DELETES_PARAM,
+        STREAM_RESULTS_PARAM,
+        SNAPSHOT_IDS_PARAM,
+        CLEAN_EXPIRED_METADATA_PARAM
       };
 
   private static final StructType OUTPUT_TYPE =
```
```diff
@@ -104,13 +118,14 @@ public ProcedureParameter[] parameters() {
   @Override
   @SuppressWarnings("checkstyle:CyclomaticComplexity")
   public Iterator<Scan> call(InternalRow args) {
-    Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
-    Long olderThanMillis = args.isNullAt(1) ? null : DateTimeUtil.microsToMillis(args.getLong(1));
-    Integer retainLastNum = args.isNullAt(2) ? null : args.getInt(2);
-    Integer maxConcurrentDeletes = args.isNullAt(3) ? null : args.getInt(3);
-    Boolean streamResult = args.isNullAt(4) ? null : args.getBoolean(4);
-    long[] snapshotIds = args.isNullAt(5) ? null : args.getArray(5).toLongArray();
-    Boolean cleanExpiredMetadata = args.isNullAt(6) ? null : args.getBoolean(6);
+    ProcedureInput input = new ProcedureInput(spark(), tableCatalog(), PARAMETERS, args);
+    Identifier tableIdent = input.ident(TABLE_PARAM);
+    Long olderThanMillis = input.asTimestampLong(OLDER_THAN_PARAM, null);
+    Integer retainLastNum = input.asInt(RETAIN_LAST_PARAM, null);
+    Integer maxConcurrentDeletes = input.asInt(MAX_CONCURRENT_DELETES_PARAM, null);
+    boolean streamResult = input.asBoolean(STREAM_RESULTS_PARAM, false);
+    Long[] snapshotIds = input.asLongArray(SNAPSHOT_IDS_PARAM, null);
+    boolean cleanExpiredMetadata = input.asBoolean(CLEAN_EXPIRED_METADATA_PARAM, false);
 
     Preconditions.checkArgument(
         maxConcurrentDeletes == null || maxConcurrentDeletes > 0,
```
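For context, here is a hedged sketch of how the refactored procedure is invoked from Java through Spark SQL. The catalog name `my_catalog` and table `db.sample` are placeholders; the named arguments correspond to the `ProcedureParameter` constants above.

```java
import org.apache.spark.sql.SparkSession;

public class ExpireSnapshotsCallExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().getOrCreate();

    // Named arguments map onto TABLE_PARAM, OLDER_THAN_PARAM, etc.; omitted
    // optional parameters fall back to the defaults resolved by ProcedureInput
    // in call().
    spark
        .sql(
            "CALL my_catalog.system.expire_snapshots("
                + "table => 'db.sample', "
                + "older_than => TIMESTAMP '2024-01-01 00:00:00', "
                + "retain_last => 5, "
                + "stream_results => true)")
        .show();
  }
}
```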
```diff
@@ -150,14 +165,9 @@ public Iterator<Scan> call(InternalRow args) {
       }
     }
 
-    if (streamResult != null) {
-      action.option(
-          ExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult));
-    }
+    action.option(ExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult));
 
-    if (cleanExpiredMetadata != null) {
-      action.cleanExpiredMetadata(cleanExpiredMetadata);
-    }
+    action.cleanExpiredMetadata(cleanExpiredMetadata);
 
     ExpireSnapshots.Result result = action.execute();
 
```
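The null checks disappear because `call()` now asks `ProcedureInput` for a default when the argument is absent. A minimal sketch of what the `asBoolean` accessor presumably looks like, modeled on the `asInt` accessor visible in the `ProcedureInput` diff below; `ordinal(param)` is an assumed helper that resolves a parameter's position in the args row:

```java
// Sketch only: mirrors the asInt pattern shown in the diff below.
public boolean asBoolean(ProcedureParameter param, boolean defaultValue) {
  int ordinal = ordinal(param); // assumed position lookup
  return args.isNullAt(ordinal) ? defaultValue : args.getBoolean(ordinal);
}
```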
ProcedureInput.java

```diff
@@ -22,10 +22,12 @@
 import java.util.Map;
 import java.util.function.BiFunction;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.iceberg.actions.DeleteOrphanFiles;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.spark.Spark3Util;
 import org.apache.iceberg.spark.Spark3Util.CatalogAndIdentifier;
+import org.apache.iceberg.util.DateTimeUtil;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.util.ArrayData;
```
```diff
@@ -80,6 +82,22 @@ public Integer asInt(ProcedureParameter param, Integer defaultValue) {
     return args.isNullAt(ordinal) ? defaultValue : (Integer) args.getInt(ordinal);
   }
 
+  public long asTimestampLong(ProcedureParameter param) {
```
Suggested change (marked outdated):

```diff
-  public long asTimestampLong(ProcedureParameter param) {
+  public long asTimestampMillis(ProcedureParameter param) {
```

Reviewer: this should be called `asTimestampMillis`

Author: Thank you for reviewing the code. I will make the adjustments based on your suggestions.
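Given the original inline conversion `DateTimeUtil.microsToMillis(args.getLong(1))` and the `DateTimeUtil` import added to `ProcedureInput`, the renamed accessor plausibly looks like the sketch below; the `ordinal` helper is an assumption:

```java
// Sketch only: Spark passes TimestampType arguments as microseconds since
// the epoch, so the accessor converts them to milliseconds for the action API.
public Long asTimestampMillis(ProcedureParameter param, Long defaultValue) {
  int ordinal = ordinal(param); // assumed position lookup
  return args.isNullAt(ordinal)
      ? defaultValue
      : DateTimeUtil.microsToMillis(args.getLong(ordinal));
}
```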
Reviewer (comment marked outdated): I don't think we should put this method here as it's only used in a single place.

Author: This part of the code has already been improved.
Reviewer: why not make those return `long[]` instead of `Long[]`?

Author: Thank you for your suggestion. I have adjusted the return value to `long[]` to maintain consistency with the original usage.
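Following that exchange, a hedged sketch of the primitive-returning variant, mirroring the original `args.getArray(5).toLongArray()` call; the `ordinal` helper is again an assumption:

```java
// Sketch only: ArrayData.toLongArray() unpacks the Spark array value into a
// primitive long[], matching the original call-site usage.
public long[] asLongArray(ProcedureParameter param, long[] defaultValue) {
  int ordinal = ordinal(param); // assumed position lookup
  return args.isNullAt(ordinal) ? defaultValue : args.getArray(ordinal).toLongArray();
}
```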