102 commits
a4c6c93
Add SQL config and push filters down to JSON
MaxGekk Jan 24, 2020
ac7c730
Add a test to JsonSuite
MaxGekk Jan 25, 2020
b0ff6c9
Push filters to JacksonParser
MaxGekk Jan 25, 2020
a79dacd
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Jan 25, 2020
bb12fd5
Refactor the test
MaxGekk Jan 25, 2020
ccc0940
Add convertRootObject
MaxGekk Jan 25, 2020
b7f17b1
Add JsonPredicate to JsonFilters
MaxGekk Jan 25, 2020
521a685
Implemented JsonFilters.reset
MaxGekk Jan 25, 2020
a8486bf
Implemented allPredicates
MaxGekk Jan 25, 2020
b0d6939
Add buildPredicates()
MaxGekk Jan 25, 2020
e814986
Simplify buildPredicates()
MaxGekk Jan 26, 2020
c05b1e9
Refactoring buildPredicates()
MaxGekk Jan 26, 2020
15f0390
Pass StructType to JsonFilters
MaxGekk Jan 26, 2020
02aca76
Embed code to indexedPredicates
MaxGekk Jan 26, 2020
bd1d093
Simplify skipRow and reset
MaxGekk Jan 26, 2020
1c64b37
renaming
MaxGekk Jan 26, 2020
f83b93a
Deduplicate code
MaxGekk Jan 26, 2020
dd547ce
Bug fix literals
MaxGekk Jan 26, 2020
0ada227
Adopt test for complex filters to JsonFilters
MaxGekk Jan 26, 2020
f0d6a72
Add JacksonParserSuite
MaxGekk Jan 27, 2020
52e65d0
Add a benchmark
MaxGekk Jan 27, 2020
c60b332
Check spark.sql.json.filterPushdown.enabled in JsonFilters
MaxGekk Jan 27, 2020
617197a
Update benchmark results for JDK 8
MaxGekk Jan 27, 2020
03da0b2
Add comments to StructFilters
MaxGekk Jan 27, 2020
a122fb7
Add comments to JsonFilters
MaxGekk Jan 27, 2020
0aa8499
Add a test for malformed JSON records
MaxGekk Jan 28, 2020
ee53875
Add more cases in JsonSuite
MaxGekk Jan 28, 2020
3381607
Update benchmark results on jdk 11
MaxGekk Jan 28, 2020
144f5a7
Dedup code to toPredicate
MaxGekk Jan 28, 2020
94a22e1
Dedup code in JacksonParser
MaxGekk Jan 28, 2020
5d0ead1
fix coding style
MaxGekk Jan 28, 2020
bd1853c
Bug fix: convert Option to Array explicitly
MaxGekk Jan 28, 2020
330aae7
Remove empty line in JsonBenchmark
MaxGekk Jan 31, 2020
449a7e5
Fix indentation in convertObject
MaxGekk Jan 31, 2020
4527660
Check correct SQL config in JsonScanBuilder
MaxGekk Jan 31, 2020
675682b
Add a test for pushed filters to JsonScanBuilder to JsonSuite
MaxGekk Jan 31, 2020
23191e9
Set default value for filters in JacksonParser
MaxGekk Jan 31, 2020
67a74ad
Fix typo: mep -> map
MaxGekk Jan 31, 2020
4a7f0b0
Remove unused import in CSVScanBuilder.scala
MaxGekk Jan 31, 2020
d9bb50f
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Jan 31, 2020
1230f60
Fix indentation in filterToExpression
MaxGekk Feb 1, 2020
6e0aa47
size -> length
MaxGekk Feb 1, 2020
a455977
Add test "case sensitivity of filters references" to JsonSuite
MaxGekk Feb 1, 2020
942b9a9
Compute set of schema field names only once
MaxGekk Feb 1, 2020
39b4487
Add test "case sensitivity of filters references" to CSVSuite
MaxGekk Feb 1, 2020
e53171b
Regen results of CSVBenchmark and JsonBenchmark on JDK 8
MaxGekk Feb 2, 2020
f4c63fa
Regen results of CSVBenchmark and JsonBenchmark on JDK 11
MaxGekk Feb 2, 2020
bd5b9a9
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Feb 5, 2020
a583247
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Feb 14, 2020
9c76267
Change year pattern for legacy parser: uuuu -> yyyy
MaxGekk Feb 14, 2020
3279fcb
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Feb 28, 2020
e78bacc
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Mar 4, 2020
dc66f82
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Mar 5, 2020
02cd63d
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Mar 6, 2020
443992a
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Mar 26, 2020
648c23b
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Apr 16, 2020
1c4f281
Remove duplicate import in JsonSuite
MaxGekk Apr 16, 2020
0e6ffb5
Re-gen benchmarks results on JDK 8
MaxGekk Apr 17, 2020
01a7ee3
Re-gen JSON and CSV benchmark results on JDK 11
MaxGekk Apr 17, 2020
4e623b3
Filter out not-supported filters
MaxGekk Apr 17, 2020
262e3c7
Merge remote-tracking branch 'origin/master' into json-filters-pushdown
MaxGekk May 21, 2020
f2d0cad
Merge remote-tracking branch 'origin/master' into json-filters-pushdown
MaxGekk May 25, 2020
db1ac35
Re-gen benchmarks on JDK 8
MaxGekk May 26, 2020
4c37c9a
Re-gen benchmarks on JDK 11
MaxGekk May 26, 2020
8bfd599
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk May 29, 2020
e08b6e0
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Jun 19, 2020
9012456
Set version 3.1.0 for the SQL config spark.sql.json.filterPushdown.en…
MaxGekk Jun 21, 2020
31ad92c
Add an assert to `skipRow()`
MaxGekk Jun 21, 2020
50b9bb2
Merge remote-tracking branch 'origin/master' into json-filters-pushdown
MaxGekk Jun 21, 2020
9a8ba45
Replace schema.fieldIndex(attr) by index
MaxGekk Jun 23, 2020
90559de
Remove s"
MaxGekk Jun 23, 2020
38eb601
Add a comment about benchmarks for filters w/ nested column attributes
MaxGekk Jun 23, 2020
0d44c04
Merge remote-tracking branch 'remotes/origin/master' into json-filter…
MaxGekk Jun 28, 2020
e57ebd1
Update JsonBenchmark-jdk11-results.txt
MaxGekk Jul 2, 2020
36412ca
Update JsonBenchmark-results.txt
MaxGekk Jul 2, 2020
b7bdcff
Merge remote-tracking branch 'origin/master' into json-filters-pushdown
MaxGekk Jul 2, 2020
eb79544
Update JsonBenchmark-jdk11-results.txt
MaxGekk Jul 3, 2020
0a133ad
Update JsonBenchmark-results.txt
MaxGekk Jul 3, 2020
d4b88d4
Merge remote-tracking branch 'origin/master' into json-filters-pushdown
MaxGekk Jul 7, 2020
6921415
Simplify `if else`
MaxGekk Jul 7, 2020
0a1e575
Exit earlier from skipRow
MaxGekk Jul 8, 2020
3df60c1
Add a comment for checkFilterRefs
MaxGekk Jul 8, 2020
649d187
Make toRef() private
MaxGekk Jul 9, 2020
2173343
Add comments for JsonFilters
MaxGekk Jul 9, 2020
e55bb50
Use StructFilters.pushedFilters()
MaxGekk Jul 9, 2020
60cd07a
Add a comment for toRef
MaxGekk Jul 9, 2020
8ecede6
can be places -> can be placed
MaxGekk Jul 9, 2020
77bd18e
And -> Or
MaxGekk Jul 9, 2020
864ba7d
Move refCount
MaxGekk Jul 10, 2020
0155b05
Fix comments
MaxGekk Jul 15, 2020
193a57a
Fix comments in StructFilters
MaxGekk Jul 15, 2020
35c056e
Fix comments in CSVFilters
MaxGekk Jul 15, 2020
43f75a6
Fix comments in JsonFilters
MaxGekk Jul 15, 2020
4d5fe2c
`index`` -> `index`
MaxGekk Jul 15, 2020
ba7db8b
Refactoring: adding types
MaxGekk Jul 15, 2020
0ca1417
Add protected to createFilters() in StructFiltersSuite
MaxGekk Jul 15, 2020
18dea26
Fix indentation in comments
MaxGekk Jul 15, 2020
3bf3270
Move common assumption from JsonFilters to StructFilters
MaxGekk Jul 15, 2020
3f7d338
Add tests back
MaxGekk Jul 15, 2020
6938ec5
JIRA for TODO
MaxGekk Jul 15, 2020
fc725bc
`non` -> `not`
MaxGekk Jul 15, 2020
57524d6
Merge remote-tracking branch 'origin/master' into json-filters-pushdown
MaxGekk Jul 15, 2020
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala (new file)
@@ -0,0 +1,166 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst

import scala.util.Try

import org.apache.spark.sql.catalyst.StructFilters._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.sources
import org.apache.spark.sql.types.{BooleanType, StructType}

/**
* The class provides an API for applying pushed-down filters to partially or
* fully set internal rows that have the struct schema.
*
* `StructFilters` assumes that:
* - `reset()` is called before any `skipRow()` calls for a new row.
*
* @param pushedFilters The pushed-down source filters. The filters should refer to
* the fields of the provided schema.
* @param schema The required schema of records from datasource files.
*/
abstract class StructFilters(pushedFilters: Seq[sources.Filter], schema: StructType) {

protected val filters = StructFilters.pushedFilters(pushedFilters.toArray, schema)

/**
* Applies the pushed-down source filters to the given row, assuming that
* the value at `index` has already been set.
*
* @param row The row with fully or partially set values.
* @param index The index of the already set value.
* @return `true` if the currently processed row can be skipped, otherwise `false`.
*/
def skipRow(row: InternalRow, index: Int): Boolean

/**
* Resets the states of the pushed-down filters. The method must be called before
* processing any new row, otherwise `skipRow()` may return a wrong result.
*/
def reset(): Unit

/**
* Compiles source filters to a predicate.
*/
def toPredicate(filters: Seq[sources.Filter]): BasePredicate = {
val reducedExpr = filters
.sortBy(_.references.length)
.flatMap(filterToExpression(_, toRef))
.reduce(And)
Predicate.create(reducedExpr)
}

// Finds a filter attribute in the schema and converts it to a `BoundReference`
private def toRef(attr: String): Option[BoundReference] = {
// The names have been normalized and case sensitivity is not a concern here.
schema.getFieldIndex(attr).map { index =>
val field = schema(index)
BoundReference(index, field.dataType, field.nullable)
}
}
}

object StructFilters {
private def checkFilterRefs(filter: sources.Filter, fieldNames: Set[String]): Boolean = {
// The names have been normalized and case sensitivity is not a concern here.
filter.references.forall(fieldNames.contains)
}

/**
* Returns the filters currently supported by the datasource.
* @param filters The filters pushed down to the datasource.
* @param schema The data schema of the datasource files.
* @return a subset of `filters` that can be handled by the datasource.
*/
def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = {
val fieldNames = schema.fieldNames.toSet
filters.filter(checkFilterRefs(_, fieldNames))
}

private def zip[A, B](a: Option[A], b: Option[B]): Option[(A, B)] = {
a.zip(b).headOption
}

private def toLiteral(value: Any): Option[Literal] = {
Try(Literal(value)).toOption
}

/**
* Converts a filter to an expression and binds it to row positions.
*
* @param filter The filter to convert.
* @param toRef The function that converts a filter attribute to a bound reference.
* @return some expression with resolved attributes, or `None` if the conversion
* of the given filter to an expression is impossible.
*/
def filterToExpression(
filter: sources.Filter,
toRef: String => Option[BoundReference]): Option[Expression] = {
def zipAttributeAndValue(name: String, value: Any): Option[(BoundReference, Literal)] = {
zip(toRef(name), toLiteral(value))
}
def translate(filter: sources.Filter): Option[Expression] = filter match {
case sources.And(left, right) =>
zip(translate(left), translate(right)).map(And.tupled)
case sources.Or(left, right) =>
zip(translate(left), translate(right)).map(Or.tupled)
case sources.Not(child) =>
translate(child).map(Not)
case sources.EqualTo(attribute, value) =>
zipAttributeAndValue(attribute, value).map(EqualTo.tupled)
case sources.EqualNullSafe(attribute, value) =>
zipAttributeAndValue(attribute, value).map(EqualNullSafe.tupled)
case sources.IsNull(attribute) =>
toRef(attribute).map(IsNull)
case sources.IsNotNull(attribute) =>
toRef(attribute).map(IsNotNull)
case sources.In(attribute, values) =>
val literals = values.toSeq.flatMap(toLiteral)
if (literals.length == values.length) {
toRef(attribute).map(In(_, literals))
} else {
None
}
case sources.GreaterThan(attribute, value) =>
zipAttributeAndValue(attribute, value).map(GreaterThan.tupled)
case sources.GreaterThanOrEqual(attribute, value) =>
zipAttributeAndValue(attribute, value).map(GreaterThanOrEqual.tupled)
case sources.LessThan(attribute, value) =>
zipAttributeAndValue(attribute, value).map(LessThan.tupled)
case sources.LessThanOrEqual(attribute, value) =>
zipAttributeAndValue(attribute, value).map(LessThanOrEqual.tupled)
case sources.StringContains(attribute, value) =>
zipAttributeAndValue(attribute, value).map(Contains.tupled)
case sources.StringStartsWith(attribute, value) =>
zipAttributeAndValue(attribute, value).map(StartsWith.tupled)
case sources.StringEndsWith(attribute, value) =>
zipAttributeAndValue(attribute, value).map(EndsWith.tupled)
case sources.AlwaysTrue() =>
Some(Literal(true, BooleanType))
case sources.AlwaysFalse() =>
Some(Literal(false, BooleanType))
}
translate(filter)
}
}

class NoopFilters extends StructFilters(Seq.empty, new StructType()) {
override def skipRow(row: InternalRow, index: Int): Boolean = false
override def reset(): Unit = {}
}
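
To see how the pieces fit together, here is a minimal usage sketch, not part of the PR: it assumes only spark-catalyst on the classpath, and the schema, filters, and the StructFiltersDemo object name are invented for illustration. `StructFilters.pushedFilters()` keeps just the filters whose references all occur in the schema, and `filterToExpression()` translates a surviving source filter into a Catalyst expression bound to row positions.

import org.apache.spark.sql.catalyst.StructFilters
import org.apache.spark.sql.catalyst.expressions.BoundReference
import org.apache.spark.sql.sources
import org.apache.spark.sql.types._

// Hypothetical demo object, not part of the PR.
object StructFiltersDemo extends App {
  val schema = new StructType().add("i", IntegerType).add("s", StringType)
  val filters = Array[sources.Filter](
    sources.GreaterThan("i", 0),  // references a known field: kept
    sources.IsNotNull("unknown")) // references a missing field: dropped
  // Only GreaterThan("i", 0) survives because "unknown" is not in the schema.
  val supported = StructFilters.pushedFilters(filters, schema)

  // The same kind of resolver that the class's private toRef() implements:
  // map an attribute name to a BoundReference via its field index.
  def toRef(attr: String): Option[BoundReference] = {
    val idx = schema.fieldNames.indexOf(attr)
    if (idx < 0) None
    else {
      val field = schema(idx)
      Some(BoundReference(idx, field.dataType, field.nullable))
    }
  }

  // Prints something like: (input[0, int, true] > 0)
  supported.flatMap(StructFilters.filterToExpression(_, toRef)).foreach(println)
}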
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVFilters.scala
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql.catalyst.csv
 
-import scala.util.Try
-
-import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.{InternalRow, StructFilters}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources
-import org.apache.spark.sql.types.{BooleanType, StructType}
+import org.apache.spark.sql.types.StructType
 
 /**
  * An instance of the class compiles filters to predicates and allows to
@@ -33,7 +31,8 @@ import org.apache.spark.sql.types.{BooleanType, StructType}
  * @param filters The filters pushed down to CSV datasource.
  * @param requiredSchema The schema with only fields requested by the upper layer.
  */
-class CSVFilters(filters: Seq[sources.Filter], requiredSchema: StructType) {
+class CSVFilters(filters: Seq[sources.Filter], requiredSchema: StructType)
+  extends StructFilters(filters, requiredSchema) {
   /**
    * Converted filters to predicates and grouped by maximum field index
    * in the read schema. For example, if a filter refers to 2 attributes
@@ -54,138 +53,49 @@ class CSVFilters(filters: Seq[sources.Filter], requiredSchema: StructType)
       for (filter <- filters) {
         val refs = filter.references
         val index = if (refs.isEmpty) {
-          // For example, AlwaysTrue and AlwaysFalse doesn't have any references
+          // For example, `AlwaysTrue` and `AlwaysFalse` don't have any references.
           // Filters w/o refs always return the same result. Taking into account
-          // that predicates are combined via And, we can apply such filters only
+          // that predicates are combined via `And`, we can apply such filters only
           // once at the position 0.
           0
         } else {
           // readSchema must contain attributes of all filters.
-          // Accordingly, fieldIndex() returns a valid index always.
+          // Accordingly, `fieldIndex()` always returns a valid index.
           refs.map(requiredSchema.fieldIndex).max
         }
         groupedFilters(index) :+= filter
       }
       if (len > 0 && !groupedFilters(0).isEmpty) {
-        // We assume that filters w/o refs like AlwaysTrue and AlwaysFalse
+        // We assume that filters w/o refs like `AlwaysTrue` and `AlwaysFalse`
         // can be evaluated faster than others. We put them in front of others.
         val (literals, others) = groupedFilters(0).partition(_.references.isEmpty)
         groupedFilters(0) = literals ++ others
       }
       for (i <- 0 until len) {
         if (!groupedFilters(i).isEmpty) {
-          val reducedExpr = groupedFilters(i)
-            .flatMap(CSVFilters.filterToExpression(_, toRef))
-            .reduce(And)
-          groupedPredicates(i) = Predicate.create(reducedExpr)
+          groupedPredicates(i) = toPredicate(groupedFilters(i))
         }
       }
     }
     groupedPredicates
   }
 
   /**
-   * Applies all filters that refer to row fields at the positions from 0 to index.
+   * Applies all filters that refer to row fields at the positions from 0 to `index`.
    * @param row The internal row to check.
    * @param index Maximum field index. The function assumes that all fields
-   *              from 0 to index position are set.
-   * @return false iff row fields at the position from 0 to index pass filters
+   *              from 0 to the `index` position are set.
+   * @return `false` iff row fields at the positions from 0 to `index` pass filters
    *         or there are no applicable filters,
-   *         otherwise false if at least one of the filters returns false.
+   *         otherwise `true` if at least one of the filters returns `false`.
    */
   def skipRow(row: InternalRow, index: Int): Boolean = {
     val predicate = predicates(index)
     predicate != null && !predicate.eval(row)
   }
-
-  // Finds a filter attribute in the read schema and converts it to a `BoundReference`
-  private def toRef(attr: String): Option[BoundReference] = {
-    requiredSchema.getFieldIndex(attr).map { index =>
-      val field = requiredSchema(index)
-      BoundReference(requiredSchema.fieldIndex(attr), field.dataType, field.nullable)
-    }
-  }
-}
-
-object CSVFilters {
-  private def checkFilterRefs(filter: sources.Filter, schema: StructType): Boolean = {
-    val fieldNames = schema.fields.map(_.name).toSet
-    filter.references.forall(fieldNames.contains(_))
-  }
-
-  /**
-   * Returns the filters currently supported by CSV datasource.
-   * @param filters The filters pushed down to CSV datasource.
-   * @param schema data schema of CSV files.
-   * @return a sub-set of `filters` that can be handled by CSV datasource.
-   */
-  def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = {
-    filters.filter(checkFilterRefs(_, schema))
-  }
-
-  private def zip[A, B](a: Option[A], b: Option[B]): Option[(A, B)] = {
-    a.zip(b).headOption
-  }
-
-  private def toLiteral(value: Any): Option[Literal] = {
-    Try(Literal(value)).toOption
-  }
-
-  /**
-   * Converts a filter to an expression and binds it to row positions.
-   *
-   * @param filter The filter to convert.
-   * @param toRef The function converts a filter attribute to a bound reference.
-   * @return some expression with resolved attributes or None if the conversion
-   *         of the given filter to an expression is impossible.
-   */
-  def filterToExpression(
-      filter: sources.Filter,
-      toRef: String => Option[BoundReference]): Option[Expression] = {
-    def zipAttributeAndValue(name: String, value: Any): Option[(BoundReference, Literal)] = {
-      zip(toRef(name), toLiteral(value))
-    }
-    def translate(filter: sources.Filter): Option[Expression] = filter match {
-      case sources.And(left, right) =>
-        zip(translate(left), translate(right)).map(And.tupled)
-      case sources.Or(left, right) =>
-        zip(translate(left), translate(right)).map(Or.tupled)
-      case sources.Not(child) =>
-        translate(child).map(Not)
-      case sources.EqualTo(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(EqualTo.tupled)
-      case sources.EqualNullSafe(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(EqualNullSafe.tupled)
-      case sources.IsNull(attribute) =>
-        toRef(attribute).map(IsNull)
-      case sources.IsNotNull(attribute) =>
-        toRef(attribute).map(IsNotNull)
-      case sources.In(attribute, values) =>
-        val literals = values.toSeq.flatMap(toLiteral)
-        if (literals.length == values.length) {
-          toRef(attribute).map(In(_, literals))
-        } else {
-          None
-        }
-      case sources.GreaterThan(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(GreaterThan.tupled)
-      case sources.GreaterThanOrEqual(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(GreaterThanOrEqual.tupled)
-      case sources.LessThan(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(LessThan.tupled)
-      case sources.LessThanOrEqual(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(LessThanOrEqual.tupled)
-      case sources.StringContains(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(Contains.tupled)
-      case sources.StringStartsWith(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(StartsWith.tupled)
-      case sources.StringEndsWith(attribute, value) =>
-        zipAttributeAndValue(attribute, value).map(EndsWith.tupled)
-      case sources.AlwaysTrue() =>
-        Some(Literal(true, BooleanType))
-      case sources.AlwaysFalse() =>
-        Some(Literal(false, BooleanType))
-    }
-    translate(filter)
-  }
+
+  // CSV filters are applied sequentially, and there is no need to track which
+  // filter references point to already set row values. The `reset()` method is
+  // trivial because the filters don't have any state.
+  def reset(): Unit = {}
 }
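
As a usage note, here is a hedged sketch, again not from the PR, of the contract a parser follows when driving these filters: call `reset()` once per record, then `skipRow(row, i)` right after field `i` is set, so the rest of a record can be skipped as soon as a fully bound predicate fails. The two-column schema, the filter, and the CSVFiltersDemo object name are invented for illustration.

import org.apache.spark.sql.catalyst.csv.CSVFilters
import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
import org.apache.spark.sql.sources
import org.apache.spark.sql.types._

// Hypothetical demo object, not part of the PR.
object CSVFiltersDemo extends App {
  val schema = new StructType().add("a", IntegerType).add("b", IntegerType)
  // GreaterThan("a", 10) references only field 0, so CSVFilters groups it
  // under index 0 and can evaluate it before field "b" is even parsed.
  val csvFilters = new CSVFilters(Seq(sources.GreaterThan("a", 10)), schema)

  val row = new SpecificInternalRow(schema.map(_.dataType))
  csvFilters.reset()  // trivial for CSV, but part of the StructFilters contract
  row.setInt(0, 5)    // field "a" has just been parsed
  println(csvFilters.skipRow(row, 0)) // true: 5 > 10 fails, the row is skipped early
}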