diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
index 016e764f05cb..c3f2c49a446b 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -583,6 +583,12 @@
     ],
     "sqlState" : "22KD3"
   },
+  "CANNOT_USE_MULTI_ALIASES_IN_WATERMARK_CLAUSE" : {
+    "message" : [
+      "Multiple aliases are not supported in the watermark clause."
+    ],
+    "sqlState" : "42000"
+  },
   "CANNOT_WRITE_STATE_STORE" : {
     "message" : [
       "Error writing state store files for provider <providerClass>."
     ],
@@ -4985,6 +4991,12 @@
     ],
     "sqlState" : "4274K"
   },
+  "REQUIRES_EXPLICIT_NAME_IN_WATERMARK_CLAUSE" : {
+    "message" : [
+      "The watermark clause requires an explicit name if an expression is specified, but got <sqlExpr>."
+    ],
+    "sqlState" : "42000"
+  },
   "REQUIRES_SINGLE_PART_NAMESPACE" : {
     "message" : [
       "<sessionCatalog> requires a single-part namespace, but got <namespace>."
diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index c82691ef4ee2..dcbe772dbe5c 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -497,6 +497,7 @@ Below is a list of all the keywords in Spark SQL.
 |DEFAULT|non-reserved|non-reserved|non-reserved|
 |DEFINED|non-reserved|non-reserved|non-reserved|
 |DEFINER|non-reserved|non-reserved|non-reserved|
+|DELAY|non-reserved|non-reserved|non-reserved|
 |DELETE|non-reserved|non-reserved|reserved|
 |DELIMITED|non-reserved|non-reserved|non-reserved|
 |DESC|non-reserved|non-reserved|non-reserved|
@@ -793,6 +794,7 @@ Below is a list of all the keywords in Spark SQL.
 |VIEW|non-reserved|non-reserved|non-reserved|
 |VIEWS|non-reserved|non-reserved|non-reserved|
 |VOID|non-reserved|non-reserved|non-reserved|
+|WATERMARK|non-reserved|non-reserved|non-reserved|
 |WEEK|non-reserved|non-reserved|non-reserved|
 |WEEKS|non-reserved|non-reserved|non-reserved|
 |WHEN|reserved|non-reserved|reserved|
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
index 461af320097b..400461d2d497 100644
--- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
+++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
@@ -206,6 +206,7 @@ DECLARE: 'DECLARE';
 DEFAULT: 'DEFAULT';
 DEFINED: 'DEFINED';
 DEFINER: 'DEFINER';
+DELAY: 'DELAY';
 DELETE: 'DELETE';
 DELIMITED: 'DELIMITED';
 DESC: 'DESC';
@@ -501,6 +502,7 @@ VERSION: 'VERSION';
 VIEW: 'VIEW';
 VIEWS: 'VIEWS';
 VOID: 'VOID';
+WATERMARK: 'WATERMARK';
 WEEK: 'WEEK';
 WEEKS: 'WEEKS';
 WHEN: 'WHEN';
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 44cd1f04821b..18f262f817f8 100644
--- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -382,8 +382,10 @@ createPipelineDatasetHeader
     ;
 
 streamRelationPrimary
-    : STREAM multipartIdentifier optionsClause? tableAlias #streamTableName
-    | STREAM LEFT_PAREN multipartIdentifier RIGHT_PAREN optionsClause? tableAlias #streamTableName
+    : STREAM multipartIdentifier optionsClause? watermarkClause?
+        tableAlias #streamTableName
+    | STREAM LEFT_PAREN multipartIdentifier RIGHT_PAREN
+        optionsClause? watermarkClause? tableAlias #streamTableName
     ;
 
 setResetStatement
@@ -927,6 +929,10 @@ lateralView
     : LATERAL VIEW (OUTER)?
qualifiedName LEFT_PAREN (expression (COMMA expression)*)? RIGHT_PAREN tblName=identifier (AS? colName+=identifier (COMMA colName+=identifier)*)?
     ;
 
+watermarkClause
+    : WATERMARK colName=namedExpression DELAY OF delay=interval
+    ;
+
 setQuantifier
     : DISTINCT
     | ALL
     ;
@@ -1001,9 +1007,11 @@ identifierComment
 relationPrimary
     : streamRelationPrimary #streamRelation
     | identifierReference temporalClause?
-        optionsClause? sample? tableAlias #tableName
-    | LEFT_PAREN query RIGHT_PAREN sample? tableAlias #aliasedQuery
-    | LEFT_PAREN relation RIGHT_PAREN sample? tableAlias #aliasedRelation
+        optionsClause? sample? watermarkClause? tableAlias #tableName
+    | LEFT_PAREN query RIGHT_PAREN sample? watermarkClause?
+        tableAlias #aliasedQuery
+    | LEFT_PAREN relation RIGHT_PAREN sample?
+        watermarkClause? tableAlias #aliasedRelation
     | inlineTable #inlineTableDefault2
     | functionTable #tableValuedFunction
     ;
@@ -1012,6 +1020,8 @@ optionsClause
     : WITH options=propertyList
     ;
 
+// Unlike all other relation primaries, we do not support watermarkClause for
+// inlineTable.
 inlineTable
     : VALUES expression (COMMA expression)* tableAlias
     ;
@@ -1048,10 +1058,13 @@ functionTableArgument
     | functionArgument
     ;
 
+// This rule is only used in relationPrimary, where having watermarkClause makes sense. If it
+// becomes referenced by other clauses, please check whether watermarkClause makes sense for
+// them. If not, consider splitting this rule.
 functionTable
     : funcName=functionName LEFT_PAREN
       (functionTableArgument (COMMA functionTableArgument)*)?
-      RIGHT_PAREN tableAlias
+      RIGHT_PAREN watermarkClause? tableAlias
     ;
 
 tableAlias
@@ -1819,6 +1832,7 @@ ansiNonReserved
     | DEFAULT
     | DEFINED
     | DEFINER
+    | DELAY
     | DELETE
     | DELIMITED
     | DESC
@@ -2063,6 +2077,7 @@
+    | WATERMARK
     | WEEK
     | WEEKS
     | WHILE
     | WINDOW
     | WITHOUT
     | YEAR
@@ -2188,6 +2203,7 @@ nonReserved
     | DEFAULT
     | DEFINED
     | DEFINER
+    | DELAY
     | DELETE
     | DELIMITED
     | DESC
@@ -2469,6 +2485,7 @@ nonReserved
     | VIEW
     | VIEWS
     | VOID
+    | WATERMARK
     | WEEK
     | WEEKS
     | WHILE
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index efbf86c76af1..98c514925fa0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -636,6 +636,10 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
       case c: CollectMetrics if c.child.resolved &&
           AliasResolution.hasUnresolvedAlias(c.metrics) =>
         c.copy(metrics = AliasResolution.assignAliases(c.metrics))
+
+      case u: UnresolvedEventTimeWatermark
+          if u.child.resolved && AliasResolution.hasUnresolvedAlias(Seq(u.eventTimeColExpr)) =>
+        u.copy(eventTimeColExpr = AliasResolution.assignAliases(Seq(u.eventTimeColExpr)).head)
     }
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveEventTimeWatermark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveEventTimeWatermark.scala
new file mode 100644
index 000000000000..ddef19d42bd7
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveEventTimeWatermark.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.TreePattern
+import org.apache.spark.sql.catalyst.util.AUTO_GENERATED_ALIAS
+
+/**
+ * Resolves [[UnresolvedEventTimeWatermark]] to [[EventTimeWatermark]].
+ */
+object ResolveEventTimeWatermark extends Rule[LogicalPlan] {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUpWithPruning(
+    _.containsPattern(TreePattern.UNRESOLVED_EVENT_TIME_WATERMARK), ruleId) {
+
+    case u: UnresolvedEventTimeWatermark if u.eventTimeColExpr.resolved && u.childrenResolved =>
+      if (u.eventTimeColExpr.metadata.contains(AUTO_GENERATED_ALIAS) &&
+          u.eventTimeColExpr.metadata.getString(AUTO_GENERATED_ALIAS) == "true") {
+        throw new AnalysisException(
+          errorClass = "REQUIRES_EXPLICIT_NAME_IN_WATERMARK_CLAUSE",
+          messageParameters = Map("sqlExpr" -> u.eventTimeColExpr.sql)
+        )
+      }
+
+      val uuid = java.util.UUID.randomUUID()
+
+      val attrRef = u.eventTimeColExpr.toAttribute
+      if (u.child.outputSet.contains(u.eventTimeColExpr)) {
+        // No projection is needed since the referenced attribute is already available in the
+        // child's output.
+        EventTimeWatermark(uuid, attrRef, u.delay, u.child)
+      } else {
+        // We need to inject a projection since the matching column is not directly available
+        // in the child's output.
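+        // For example, `WATERMARK timestamp_seconds(value) AS eventTime DELAY OF INTERVAL ...`
+        // over a child that outputs only `value` ends up as Project(eventTime, <child columns>)
+        // with the EventTimeWatermark on top, so downstream operators can reference both the
+        // new event time column and the original columns.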
+ val proj = Project(Seq(u.eventTimeColExpr) ++ u.child.output, u.child) + EventTimeWatermark(uuid, attrRef, u.delay, proj) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index b759c70266f7..3bdeb6d71884 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.connector.catalog.TableWritePrivilege import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.types.{DataType, Metadata, StructType} import org.apache.spark.sql.util.{CaseInsensitiveStringMap, SchemaUtils} +import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.ArrayImplicits._ /** @@ -1228,3 +1229,15 @@ case class UnresolvedExecuteImmediate( final override val nodePatterns: Seq[TreePattern] = Seq(EXECUTE_IMMEDIATE) } + +case class UnresolvedEventTimeWatermark( + eventTimeColExpr: NamedExpression, + delay: CalendarInterval, + child: LogicalPlan) + extends UnresolvedUnaryNode { + + final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_EVENT_TIME_WATERMARK) + + override protected def withNewChildInternal( + newChild: LogicalPlan): UnresolvedEventTimeWatermark = copy(child = newChild) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 1d7cf5455e57..06f4489c4ab4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.trees.CurrentOrigin import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} /** * A collection of implicit conversions that create a DSL for constructing catalyst data structures. 
@@ -566,6 +566,19 @@
     }
 
     def deduplicate(colNames: Attribute*): LogicalPlan = Deduplicate(colNames, logicalPlan)
+
+    def withWatermark(
+        uuid: java.util.UUID,
+        expr: NamedExpression,
+        delayThreshold: CalendarInterval): LogicalPlan = {
+      EventTimeWatermark(uuid, expr.toAttribute, delayThreshold, logicalPlan)
+    }
+
+    def unresolvedWithWatermark(
+        expr: NamedExpression,
+        delayThreshold: CalendarInterval): LogicalPlan = {
+      UnresolvedEventTimeWatermark(expr, delayThreshold, logicalPlan)
+    }
   }
 }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index babcb7b7cf9b..c24a7dfd30a6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -29,6 +29,7 @@ import org.antlr.v4.runtime.tree.{ParseTree, RuleNode, TerminalNode}
 import org.apache.spark.{SparkArithmeticException, SparkException, SparkIllegalArgumentException, SparkThrowable, SparkThrowableHelper}
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.LogKeys.PARTITION_SPECIFICATION
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FUNC_ALIAS
@@ -2076,6 +2077,34 @@
       query)
   }
 
+  /**
+   * Add an [[UnresolvedEventTimeWatermark]] on top of a logical plan; it is resolved into
+   * [[EventTimeWatermark]] during analysis.
+   */
+  private def withWatermark(
+      ctx: WatermarkClauseContext,
+      query: LogicalPlan): LogicalPlan = withOrigin(ctx) {
+    val expression = visitNamedExpression(ctx.namedExpression())
+
+    val namedExpression = expression match {
+      // Need to check this first since MultiAlias is also a NamedExpression.
+      case _: MultiAlias =>
+        throw new AnalysisException(
+          errorClass = "CANNOT_USE_MULTI_ALIASES_IN_WATERMARK_CLAUSE",
+          messageParameters = Map.empty
+        )
+      case e: NamedExpression => e
+      case e => UnresolvedAlias(e)
+    }
+
+    val delayInterval = visitInterval(ctx.delay)
+
+    val delay = IntervalUtils.fromIntervalString(delayInterval.toString)
+    require(!IntervalUtils.isNegative(delay),
+      s"delay threshold ($delayInterval) should not be negative.")
+
+    UnresolvedEventTimeWatermark(namedExpression, delay, query)
+  }
+
  /**
   * Create a single relation referenced in a FROM clause. This method is used when a part of the
This method is used when a part of the * join condition is nested, for example: @@ -2255,7 +2284,8 @@ class AstBuilder extends DataTypeAstBuilder val relation = createUnresolvedRelation(ctx.identifierReference, Option(ctx.optionsClause)) val table = mayApplyAliasPlan( ctx.tableAlias, relation.optionalMap(ctx.temporalClause)(withTimeTravel)) - table.optionalMap(ctx.sample)(withSample) + val sample = table.optionalMap(ctx.sample)(withSample) + sample.optionalMap(ctx.watermarkClause)(withWatermark) } override def visitVersion(ctx: VersionContext): Option[String] = { @@ -2395,7 +2425,9 @@ class AstBuilder extends DataTypeAstBuilder val tvfAliases = if (aliases.nonEmpty) UnresolvedTVFAliases(ident, tvf, aliases) else tvf - tvfAliases.optionalMap(func.tableAlias.strictIdentifier)(aliasPlan) + val watermarkClause = func.watermarkClause() + val tvfWithWatermark = tvfAliases.optionalMap(watermarkClause)(withWatermark) + tvfWithWatermark.optionalMap(func.tableAlias.strictIdentifier)(aliasPlan) }) } @@ -2407,7 +2439,9 @@ class AstBuilder extends DataTypeAstBuilder optionsClause = Option(ctx.optionsClause), writePrivileges = Seq.empty, isStreaming = true) - mayApplyAliasPlan(ctx.tableAlias, tableStreamingRelation) + + val tableWithWatermark = tableStreamingRelation.optionalMap(ctx.watermarkClause)(withWatermark) + mayApplyAliasPlan(ctx.tableAlias, tableWithWatermark) } /** @@ -2450,7 +2484,8 @@ class AstBuilder extends DataTypeAstBuilder */ override def visitAliasedRelation(ctx: AliasedRelationContext): LogicalPlan = withOrigin(ctx) { val relation = plan(ctx.relation).optionalMap(ctx.sample)(withSample) - mayApplyAliasPlan(ctx.tableAlias, relation) + val watermark = relation.optionalMap(ctx.watermarkClause)(withWatermark) + mayApplyAliasPlan(ctx.tableAlias, watermark) } /** @@ -2463,7 +2498,7 @@ class AstBuilder extends DataTypeAstBuilder */ override def visitAliasedQuery(ctx: AliasedQueryContext): LogicalPlan = withOrigin(ctx) { val relation = plan(ctx.query).optionalMap(ctx.sample)(withSample) - if (ctx.tableAlias.strictIdentifier == null) { + val alias = if (ctx.tableAlias.strictIdentifier == null) { // For un-aliased subqueries, use a default alias name that is not likely to conflict with // normal subquery names, so that parent operators can only access the columns in subquery by // unqualified names. 
Users can still use this special qualifier to access columns if they @@ -2472,6 +2507,7 @@ class AstBuilder extends DataTypeAstBuilder } else { mayApplyAliasPlan(ctx.tableAlias, relation) } + alias.optionalMap(ctx.watermarkClause)(withWatermark) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala index b9f15f3f951c..f094d7e93ec5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala @@ -86,6 +86,7 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.analysis.EliminateUnions" :: "org.apache.spark.sql.catalyst.analysis.ResolveCollationName" :: "org.apache.spark.sql.catalyst.analysis.ResolveDefaultColumns" :: + "org.apache.spark.sql.catalyst.analysis.ResolveEventTimeWatermark" :: "org.apache.spark.sql.catalyst.analysis.ResolveExecuteImmediate" :: "org.apache.spark.sql.catalyst.analysis.ResolveExpressionsWithNamePlaceholders" :: "org.apache.spark.sql.catalyst.analysis.ResolveGroupByAll" :: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala index 11105e404695..ba4e801ed0a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala @@ -176,6 +176,7 @@ object TreePattern extends Enumeration { // Unresolved Plan patterns (Alphabetically ordered) val PLAN_WITH_UNRESOLVED_IDENTIFIER: Value = Value + val UNRESOLVED_EVENT_TIME_WATERMARK: Value = Value val UNRESOLVED_HAVING: Value = Value val UNRESOLVED_HINT: Value = Value val UNRESOLVED_FUNC: Value = Value diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveEventTimeWatermarkSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveEventTimeWatermarkSuite.scala new file mode 100644 index 000000000000..c443e7de91b8 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveEventTimeWatermarkSuite.scala @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.analysis.TestRelations.streamingRelation
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.expressions.Alias
+import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.unsafe.types.CalendarInterval
+
+class ResolveEventTimeWatermarkSuite extends AnalysisTest {
+  override protected def extendedAnalysisRules: Seq[Rule[LogicalPlan]] = {
+    ResolveEventTimeWatermark +: super.extendedAnalysisRules
+  }
+
+  test("event time column expr refers to the column in child") {
+    val planBeforeRule = streamingRelation
+      .unresolvedWithWatermark($"ts", new CalendarInterval(0, 0, 1000))
+
+    val analyzed = getAnalyzer.execute(planBeforeRule)
+
+    // The EventTimeWatermark node contains a random UUID, so we can't simply compare the
+    // analyzed plan with the expected plan as a whole.
+    val uuid = java.util.UUID.randomUUID()
+
+    val uuidInjectedAnalyzed = analyzed.transform {
+      case e: EventTimeWatermark => e.copy(nodeId = uuid)
+    }
+
+    comparePlans(
+      uuidInjectedAnalyzed,
+      streamingRelation
+        .withWatermark(
+          uuid,
+          $"ts",
+          new CalendarInterval(0, 0, 1000)
+        ).analyze
+    )
+  }
+
+  test("event time column expr derives a new column from alias") {
+    val planBeforeRule = streamingRelation
+      .unresolvedWithWatermark(
+        Alias(
+          UnresolvedFunction(
+            Seq("timestamp_seconds"), Seq(UnresolvedAttribute("a")), isDistinct = false),
+          "event_time"
+        )(),
+        new CalendarInterval(0, 0, 1000))
+
+    val analyzed = getAnalyzer.execute(planBeforeRule)
+
+    // The EventTimeWatermark node contains a random UUID, so we can't simply compare the
+    // analyzed plan with the expected plan as a whole.
+    val uuid = java.util.UUID.randomUUID()
+
+    val uuidInjectedAnalyzed = analyzed.transform {
+      case e: EventTimeWatermark => e.copy(nodeId = uuid)
+    }
+
+    comparePlans(
+      uuidInjectedAnalyzed,
+      streamingRelation
+        .select(
+          Alias(
+            UnresolvedFunction(
+              Seq("timestamp_seconds"), Seq(UnresolvedAttribute("a")), isDistinct = false),
+            "event_time"
+          )(),
+          // The rule appends the child's output (`a`, `ts`) to the projection.
+          $"a",
+          $"ts"
+        )
+        .withWatermark(
+          uuid,
+          $"event_time",
+          new CalendarInterval(0, 0, 1000)
+        ).analyze
+    )
+  }
+
+  test("event time column expr derives a new column but the name is not explicitly given") {
+    val plan = streamingRelation
+      .unresolvedWithWatermark(
+        UnresolvedAlias(
+          UnresolvedFunction(
+            Seq("timestamp_seconds"), Seq(UnresolvedAttribute("a")), isDistinct = false)
+        ),
+        new CalendarInterval(0, 0, 1000))
+
+    val exc = intercept[AnalysisException] {
+      getAnalyzer.execute(plan)
+    }
+    checkError(
+      exc,
+      condition = "REQUIRES_EXPLICIT_NAME_IN_WATERMARK_CLAUSE",
+      sqlState = "42000",
+      // The sqlExpr reflects the auto-generated alias.
+      parameters = Map("sqlExpr" -> "timestamp_seconds(a) AS `timestamp_seconds(a)`")
+    )
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index c278d9c3f3be..59205f308471 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, RelationTimeTravel,
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.catalyst.util.EvaluateUnresolvedInlineTable
+import org.apache.spark.sql.catalyst.util.{EvaluateUnresolvedInlineTable, IntervalUtils}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{Decimal, DecimalType, IntegerType, LongType, StringType}
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -2017,4 +2017,150 @@
     assert(unresolvedRelation2.options == CaseInsensitiveStringMap.empty)
     assert(unresolvedRelation2.isStreaming)
   }
+
+  test("watermark clause - table & attribute reference") {
+    assertEqual(
+      """
+        |SELECT *
+        |FROM testData
+        |WATERMARK ts DELAY OF INTERVAL 10 seconds AS tbl
+        |WHERE a > 1
+        |""".stripMargin,
+      table("testData")
+        .as("tbl")
+        .unresolvedWithWatermark(
+          UnresolvedAttribute("ts"),
+          IntervalUtils.fromIntervalString("INTERVAL 10 seconds"))
+        .where($"a" > 1)
+        .select(UnresolvedStar(None))
+    )
+  }
+
+  test("watermark clause - table & expression with alias") {
+    assertEqual(
+      """
+        |SELECT *
+        |FROM testData
+        |WATERMARK timestamp_seconds(value) AS eventTime DELAY OF INTERVAL 10 seconds AS tbl
+        |WHERE a > 1
+        |""".stripMargin,
+      table("testData")
+        .as("tbl")
+        .unresolvedWithWatermark(
+          Alias(
+            UnresolvedFunction(
+              Seq("timestamp_seconds"), Seq(UnresolvedAttribute("value")), isDistinct = false),
+            "eventTime")(),
+          IntervalUtils.fromIntervalString("INTERVAL 10 seconds"))
+        .where($"a" > 1)
+        .select(UnresolvedStar(None))
+    )
+  }
+
+  test("watermark clause - table & expression without alias") {
+    assertEqual(
+      """
+        |SELECT *
+        |FROM testData
+        |WATERMARK timestamp_seconds(value) DELAY OF INTERVAL 10 seconds AS tbl
|WHERE a > 1 + |""".stripMargin, + table("testData") + .as("tbl") + .unresolvedWithWatermark( + UnresolvedAlias( + UnresolvedFunction( + Seq("timestamp_seconds"), Seq(UnresolvedAttribute("value")), isDistinct = false)), + IntervalUtils.fromIntervalString("INTERVAL 10 seconds")) + .where($"a" > 1) + .select(UnresolvedStar(None)) + ) + } + + test("watermark clause - aliased query") { + assertEqual( + """ + |SELECT * + |FROM + |( + | SELECT * + | FROM testData + |) + |WATERMARK ts DELAY OF INTERVAL 10 seconds AS tbl + |WHERE a > 1 + |""".stripMargin, + table("testData") + .select(UnresolvedStar(None)) + .as("tbl") + .unresolvedWithWatermark( + UnresolvedAttribute("ts"), + IntervalUtils.fromIntervalString("INTERVAL 10 seconds")) + .where($"a" > 1) + .select(UnresolvedStar(None)) + ) + } + + test("watermark clause - subquery") { + assertEqual( + """ + |SELECT key, time + |FROM + |( + | SELECT key, time + | FROM + | testData + | WATERMARK timestamp_seconds(ts) AS time DELAY OF INTERVAL 10 seconds + |) + |AS tbl + |WHERE key = 'a' + |""".stripMargin, + table("testData") + .unresolvedWithWatermark( + Alias( + UnresolvedFunction( + Seq("timestamp_seconds"), Seq(UnresolvedAttribute("ts")), isDistinct = false), + "time")(), + IntervalUtils.fromIntervalString("INTERVAL 10 seconds")) + .select($"key", $"time") + .as("tbl") + .where($"key" === "a") + .select($"key", $"time") + ) + } + + test("watermark clause - table valued function") { + assertEqual( + """ + |SELECT * + |FROM + |mock_tvf(1, 'a') + |WATERMARK ts DELAY OF INTERVAL 10 seconds AS dst + |WHERE a > 1 + |""".stripMargin, + UnresolvedTableValuedFunction("mock_tvf", Seq(Literal(1), Literal("a"))) + .unresolvedWithWatermark( + UnresolvedAttribute("ts"), + IntervalUtils.fromIntervalString("INTERVAL 10 seconds")) + .as("dst") + .where($"a" > 1) + .select(UnresolvedStar(None)) + ) + } + + test("watermark clause - inline table (not allowed)") { + val query = """ + |SELECT * + |FROM + |VALUES (1, 1), (2, 2) + |WATERMARK ts DELAY OF INTERVAL 10 seconds AS dst + |WHERE a > 1 + |""".stripMargin + checkError( + exception = parseException(query), + condition = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'ts'", "hint" -> "")) + } + + private def intercept(sqlCommand: String, messages: String*): Unit = + interceptParseException(parsePlan)(sqlCommand, messages: _*)() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala index 30bbe3dd9c10..59bcc864ac81 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala @@ -595,9 +595,8 @@ class Dataset[T] private[sql]( val parsedDelay = IntervalUtils.fromIntervalString(delayThreshold) require(!IntervalUtils.isNegative(parsedDelay), s"delay threshold ($delayThreshold) should not be negative.") - EliminateEventTimeWatermark( - EventTimeWatermark(util.UUID.randomUUID(), UnresolvedAttribute(eventTime), - parsedDelay, logicalPlan)) + EventTimeWatermark(util.UUID.randomUUID(), UnresolvedAttribute(eventTime), + parsedDelay, logicalPlan) } /** @inheritdoc */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 5abb1e75543a..c967497b660c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.internal import org.apache.spark.annotation.Unstable import org.apache.spark.sql.{DataSourceRegistration, ExperimentalMethods, SparkSessionExtensions, UDTFRegistration} import org.apache.spark.sql.artifact.ArtifactManager -import org.apache.spark.sql.catalyst.analysis.{Analyzer, EvalSubqueriesForTimeTravel, FunctionRegistry, InvokeProcedures, ReplaceCharWithVarchar, ResolveDataSource, ResolveExecuteImmediate, ResolveSessionCatalog, ResolveTranspose, TableFunctionRegistry} +import org.apache.spark.sql.catalyst.analysis.{Analyzer, EvalSubqueriesForTimeTravel, FunctionRegistry, InvokeProcedures, ReplaceCharWithVarchar, ResolveDataSource, ResolveEventTimeWatermark, ResolveExecuteImmediate, ResolveSessionCatalog, ResolveTranspose, TableFunctionRegistry} import org.apache.spark.sql.catalyst.analysis.resolver.ResolverExtension import org.apache.spark.sql.catalyst.catalog.{FunctionExpressionBuilder, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Expression, ExtractSemiStructuredFields} @@ -246,6 +246,7 @@ abstract class BaseSessionStateBuilder( new InvokeProcedures(session) +: ResolveExecuteImmediate(session, this.catalogManager) +: ExtractSemiStructuredFields +: + ResolveEventTimeWatermark +: customResolutionRules override val postHocResolutionRules: Seq[Rule[LogicalPlan]] = diff --git a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out index a067d2c53d05..b8443e417caf 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out @@ -92,6 +92,7 @@ DECLARE false DEFAULT false DEFINED false DEFINER false +DELAY false DELETE false DELIMITED false DESC false @@ -384,6 +385,7 @@ VERSION false VIEW false VIEWS false VOID false +WATERMARK false WEEK false WEEKS false WHEN true diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out index 93822c6c6b75..00baa0c7e725 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out @@ -92,6 +92,7 @@ DECLARE false DEFAULT false DEFINED false DEFINER false +DELAY false DELETE false DELIMITED false DESC false @@ -384,6 +385,7 @@ VERSION false VIEW false VIEWS false VOID false +WATERMARK false WEEK false WEEKS false WHEN false diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out index 93822c6c6b75..00baa0c7e725 100644 --- a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out @@ -92,6 +92,7 @@ DECLARE false DEFAULT false DEFINED false DEFINER false +DELAY false DELETE false DELIMITED false DESC false @@ -384,6 +385,7 @@ VERSION false VIEW false VIEWS false VOID false +WATERMARK false WEEK false WEEKS false WHEN false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkWithWatermarkDefInSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkWithWatermarkDefInSelectSuite.scala new file mode 100644 index 000000000000..9fcdf5cfab8a --- /dev/null +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkWithWatermarkDefInSelectSuite.scala @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import java.{util => ju} +import java.text.SimpleDateFormat + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.UTC +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions.{col, timestamp_seconds} +import org.apache.spark.sql.streaming.StateStoreMetricsTest + +class EventTimeWatermarkWithWatermarkDefInSelectSuite + extends StateStoreMetricsTest + with BeforeAndAfter + with Logging { + + import testImplicits._ + + after { + sqlContext.streams.active.foreach(_.stop()) + } + + test("event time and watermark metrics with watermark in select statement - case 1") { + // All event time metrics where watermarking is set + val inputData = MemoryStream[Int] + val df = inputData.toDF() + .withColumn("eventTime", timestamp_seconds(col("value"))) + df.createOrReplaceTempView("stream_src") + val aggWithWatermark = spark.sql( + """ + |SELECT + | CAST(window.start AS LONG), CAST(count(*) AS LONG) AS count + |FROM + | stream_src WATERMARK eventTime DELAY OF INTERVAL 10 seconds + |GROUP BY window(eventTime, '5 seconds') + |""".stripMargin) + + testWindowedAggregation(inputData, aggWithWatermark) + } + + test("event time and watermark metrics with watermark in select statement - case 2") { + // All event time metrics where watermarking is set + val inputData = MemoryStream[Int] + val df = inputData.toDF() + df.createOrReplaceTempView("stream_src") + val aggWithWatermark = spark.sql( + """ + |SELECT + | CAST(window.start AS LONG), CAST(count(*) AS LONG) AS count + |FROM + | stream_src + | WATERMARK timestamp_seconds(value) AS eventTime DELAY OF INTERVAL 10 seconds + |GROUP BY window(eventTime, '5 seconds') + |""".stripMargin) + + testWindowedAggregation(inputData, aggWithWatermark) + } + + private def testWindowedAggregation( + inputData: MemoryStream[Int], + dataFrame: DataFrame): Unit = { + testStream(dataFrame)( + AddData(inputData, 15), + CheckAnswer(), + assertEventStats(min = 15, max = 15, avg = 15, wtrmark = 0), + AddData(inputData, 10, 12, 14), + CheckAnswer(), + assertEventStats(min = 10, max = 14, avg = 12, wtrmark = 5), + AddData(inputData, 25), + CheckAnswer((10, 3)), + assertEventStats(min = 25, max = 25, avg = 25, wtrmark = 5) + ) + } + + test("stream-stream join with watermark in select statement - case 1") { + val leftInput = MemoryStream[(Int, Int)] + val rightInput = MemoryStream[(Int, Int)] + + val df1 = 
leftInput.toDF().toDF("leftKey", "time")
+      .select($"leftKey", timestamp_seconds($"time") as "leftTime",
+        ($"leftKey" * 2) as "leftValue")
+    val df2 = rightInput.toDF().toDF("rightKey", "time")
+      .select($"rightKey", timestamp_seconds($"time") as "rightTime",
+        ($"rightKey" * 3) as "rightValue")
+
+    df1.createOrReplaceTempView("stream_left")
+    df2.createOrReplaceTempView("stream_right")
+
+    val joined = spark.sql(
+      """
+        |SELECT
+        |  leftKey, rightKey, CAST(leftTime AS INTEGER), CAST(rightTime AS INTEGER)
+        |FROM
+        |  stream_left WATERMARK leftTime DELAY OF INTERVAL 0 second
+        |FULL OUTER JOIN
+        |  stream_right WATERMARK rightTime DELAY OF INTERVAL 0 second
+        |ON
+        |  leftKey = rightKey AND leftTime BETWEEN rightTime - INTERVAL 5 SECONDS
+        |    AND rightTime + INTERVAL 5 SECONDS
+        |""".stripMargin)
+
+    testStreamStreamTimeIntervalJoin(leftInput, rightInput, joined)
+  }
+
+  test("stream-stream join with watermark in select statement - case 2") {
+    val leftInput = MemoryStream[(Int, Int)]
+    val rightInput = MemoryStream[(Int, Int)]
+
+    val df1 = leftInput.toDF().toDF("leftKey", "time")
+    val df2 = rightInput.toDF().toDF("rightKey", "time")
+
+    df1.createOrReplaceTempView("stream_left")
+    df2.createOrReplaceTempView("stream_right")
+
+    val joined = spark.sql(
+      """
+        |SELECT
+        |  leftKey, rightKey, CAST(leftTime AS INTEGER), CAST(rightTime AS INTEGER)
+        |FROM
+        |(
+        |  SELECT
+        |    leftKey, leftTime, leftKey * 2 AS leftValue
+        |  FROM
+        |    stream_left
+        |    WATERMARK timestamp_seconds(time) AS leftTime DELAY OF INTERVAL 0 second
+        |)
+        |FULL OUTER JOIN
+        |(
+        |  SELECT
+        |    rightKey, rightTime, rightKey * 3 AS rightValue
+        |  FROM
+        |    stream_right
+        |    WATERMARK timestamp_seconds(time) AS rightTime DELAY OF INTERVAL 0 second
+        |)
+        |ON
+        |  leftKey = rightKey AND leftTime BETWEEN rightTime - INTERVAL 5 SECONDS
+        |    AND rightTime + INTERVAL 5 SECONDS
+        |""".stripMargin)
+
+    testStreamStreamTimeIntervalJoin(leftInput, rightInput, joined)
+  }
+
+  private def testStreamStreamTimeIntervalJoin(
+      leftInput: MemoryStream[(Int, Int)],
+      rightInput: MemoryStream[(Int, Int)],
+      dataFrame: DataFrame): Unit = {
+    testStream(dataFrame)(
+      AddData(leftInput, (1, 5), (3, 5)),
+      CheckNewAnswer(),
+      // states
+      // left: (1, 5), (3, 5)
+      // right: nothing
+      assertNumStateRows(total = 2, updated = 2),
+      AddData(rightInput, (1, 10), (2, 5)),
+      // Match left row in the state.
+      CheckNewAnswer(Row(1, 1, 5, 10)),
+      // states
+      // left: (1, 5), (3, 5)
+      // right: (1, 10), (2, 5)
+      assertNumStateRows(total = 4, updated = 2),
+      AddData(rightInput, (1, 9)),
+      // Match left row in the state.
+      CheckNewAnswer(Row(1, 1, 5, 9)),
+      // states
+      // left: (1, 5), (3, 5)
+      // right: (1, 10), (2, 5), (1, 9)
+      assertNumStateRows(total = 5, updated = 1),
+      // Advance the event time watermark to 30s (the delay is 0 seconds) by adding data with
+      // time = 30s on the left input.
+      AddData(leftInput, (1, 7), (1, 30)),
+      CheckNewAnswer(Row(1, 1, 7, 9), Row(1, 1, 7, 10)),
+      // states
+      // left: (1, 5), (3, 5), (1, 7), (1, 30)
+      // right: (1, 10), (2, 5), (1, 9)
+      assertNumStateRows(total = 7, updated = 2),
+      // Watermark = 30 - 0 = 30, no matched row.
+      // Generate outer join result for all non-matched rows when the watermark advances.
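+      // Note (assumption about eviction semantics): for time-interval joins the state
+      // eviction threshold is derived from the join bounds (+/- 5 seconds here), so rows
+      // with event time below watermark - 5s = 25s can no longer match any future input
+      // and are dropped from state.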
+      AddData(rightInput, (0, 30)),
+      CheckNewAnswer(Row(3, null, 5, null), Row(null, 2, null, 5)),
+      // states
+      // left: (1, 30)
+      // right: (0, 30)
+      //
+      // states evicted
+      // left: (1, 5), (3, 5), (1, 7) (below the eviction threshold = 25)
+      // right: (1, 10), (2, 5), (1, 9) (below the eviction threshold = 25)
+      assertNumStateRows(total = 2, updated = 1)
+    )
+  }
+
+  test("stream-batch join followed by time window aggregation") {
+    val inputData = MemoryStream[Int]
+    val df = inputData.toDF()
+      .withColumn("eventTime", timestamp_seconds(col("value")))
+    df.createOrReplaceTempView("stream_src")
+
+    val batchDf = spark.range(0, 50).map { i =>
+      if (i % 2 == 0) (i, "even") else (i, "odd")
+    }.toDF("value", "batch_value")
+    batchDf.createOrReplaceTempView("batch_src")
+
+    val agg = spark.sql(
+      """
+        |SELECT
+        |  CAST(window.start AS LONG), batch_value, CAST(count(*) AS LONG) AS count
+        |FROM
+        |  stream_src WATERMARK eventTime DELAY OF INTERVAL 10 seconds
+        |JOIN
+        |  batch_src
+        |ON
+        |  stream_src.value = batch_src.value
+        |GROUP BY batch_src.batch_value, window(eventTime, '5 seconds')
+        |""".stripMargin)
+
+    testStream(agg)(
+      AddData(inputData, 15),
+      CheckAnswer(),
+      AddData(inputData, 10, 11, 14),
+      CheckAnswer(),
+      AddData(inputData, 25),
+      CheckAnswer((10, "even", 2), (10, "odd", 1))
+    )
+  }
+
+  /** Assert event stats generated on the last batch with data in it. */
+  private def assertEventStats(body: ju.Map[String, String] => Unit): AssertOnQuery = {
+    Execute("AssertEventStats") { q =>
+      body(q.recentProgress.filter(_.numInputRows > 0).lastOption.get.eventTime)
+    }
+  }
+
+  /** Assert event stats generated on the last batch with data in it. */
+  private def assertEventStats(min: Long, max: Long, avg: Double, wtrmark: Long): AssertOnQuery = {
+    assertEventStats { e =>
+      assert(e.get("min") === formatTimestamp(min), "min value mismatch")
+      assert(e.get("max") === formatTimestamp(max), "max value mismatch")
+      assert(e.get("avg") === formatTimestamp(avg.toLong), "avg value mismatch")
+      assert(e.get("watermark") === formatTimestamp(wtrmark), "watermark value mismatch")
+    }
+  }
+
+  private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601
+  timestampFormat.setTimeZone(ju.TimeZone.getTimeZone(UTC))
+
+  private def formatTimestamp(sec: Long): String = {
+    timestampFormat.format(new ju.Date(sec * 1000))
+  }
+}
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala
index fd015330e8de..d69f99a1e42f 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala
@@ -214,7 +214,7 @@
     val sessionHandle = client.openSession(user, "")
     val infoValue = client.getInfo(sessionHandle, GetInfoType.CLI_ODBC_KEYWORDS)
     // scalastyle:off line.size.limit
-    assert(infoValue.getStringValue ==
"ADD,AFTER,AGGREGATE,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,ATOMIC,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONDITION,CONSTRAINT,CONTAINS,CONTINUE,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,ELSEIF,END,ENFORCED,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FLOW,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FOUND,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,GLOBAL,GRANT,GROUP,GROUPING,HANDLER,HAVING,HOUR,HOURS,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,ITERATE,JOIN,JSON,KEY,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LEVEL,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MAX,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NONE,NORELY,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROCEDURE,PROCEDURES,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,RECURSIVE,REDUCE,REFERENCES,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,SQLEXCEPTION,SQLSTATE,START,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUE,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WEEK,WEEKS,WHEN,WHERE,WHILE,WINDOW,WITH,WITHIN,WITHOUT,X,YEAR,YEARS,ZONE") + assert(infoValue.getStringValue == 
"ADD,AFTER,AGGREGATE,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,ATOMIC,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONDITION,CONSTRAINT,CONTAINS,CONTINUE,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELAY,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,ELSEIF,END,ENFORCED,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FLOW,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FOUND,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,GLOBAL,GRANT,GROUP,GROUPING,HANDLER,HAVING,HOUR,HOURS,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,ITERATE,JOIN,JSON,KEY,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LEVEL,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MAX,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NONE,NORELY,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROCEDURE,PROCEDURES,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,RECURSIVE,REDUCE,REFERENCES,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,SQLEXCEPTION,SQLSTATE,START,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUE,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHEN,WHERE,WHILE,WINDOW,WITH,WITHIN,WITHOUT,X,YEAR,YEARS,ZONE") // scalastyle:on line.size.limit } } diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/SqlPipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/SqlPipelineSuite.scala index 950df9167926..b522615f928a 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/SqlPipelineSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/SqlPipelineSuite.scala @@ -1006,4 +1006,58 @@ class SqlPipelineSuite extends PipelineTest with SharedSparkSession { parameters = Map.empty ) 
} + + test("Streaming Table with watermark clause") { + withTempDir { tmpDir => + spark.sql("SELECT * FROM RANGE(3)").write.format("parquet").mode("append") + .save(tmpDir.getCanonicalPath) + + val externalTableIdent = fullyQualifiedIdentifier("t") + spark.sql(s"CREATE TABLE $externalTableIdent (id string, eventTime timestamp)") + + withTable(externalTableIdent.quotedString) { + spark.sql(s"INSERT INTO $externalTableIdent VALUES ('a', timestamp_seconds(1))") + spark.sql(s"INSERT INTO $externalTableIdent VALUES ('b', timestamp_seconds(2))") + spark.sql(s"INSERT INTO $externalTableIdent VALUES ('a', timestamp_seconds(3))") + + val unresolvedDataflowGraph = unresolvedDataflowGraphFromSql( + sqlText = + s""" + |CREATE STREAMING TABLE b + |AS + |SELECT + | CAST(window.start AS LONG) AS wStart, + | CAST(window.end AS LONG) AS wEnd, + | id, + | count(*) as cnt + |FROM + | STREAM $externalTableIdent WATERMARK eventTime DELAY OF INTERVAL 10 seconds + |GROUP BY window(eventTime, '5 seconds'), id + |""".stripMargin + ) + + val updateContext = new PipelineUpdateContextImpl( + unresolvedDataflowGraph, eventCallback = _ => (), + storageRoot = storageRoot) + updateContext.pipelineExecution.runPipeline() + updateContext.pipelineExecution.awaitCompletion() + + val datasetFullyQualifiedName = fullyQualifiedIdentifier("b").quotedString + + assert( + spark.sql(s"SELECT * FROM $datasetFullyQualifiedName").collect().toSet == Set() + ) + + spark.sql(s"INSERT INTO $externalTableIdent VALUES ('a', timestamp_seconds(20))") + + updateContext.pipelineExecution.runPipeline() + updateContext.pipelineExecution.awaitCompletion() + + checkAnswer( + spark.sql(s"SELECT * FROM $datasetFullyQualifiedName ORDER BY wStart, wEnd, id"), + Seq(Row(0L, 5L, "a", 2L), Row(0L, 5L, "b", 1L)) + ) + } + } + } }
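
Example usage (an illustrative sketch, not part of this diff): given a streaming temp view named `events` with an integer column `value` (both the view name and schema are hypothetical), the new clause drives a windowed aggregation in the same way as the suites above:

    val agg = spark.sql(
      """
        |SELECT CAST(window.start AS LONG) AS wStart, CAST(count(*) AS LONG) AS cnt
        |FROM events
        |  WATERMARK timestamp_seconds(value) AS eventTime DELAY OF INTERVAL 10 seconds
        |GROUP BY window(eventTime, '5 seconds')
        |""".stripMargin)

    // `agg` can then be started as a streaming query; rows arriving more than 10 seconds
    // behind the max observed event time are treated as late.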