
Commit 601d242

Merge pull request #13 from zhzhan/orc1: predicate pushdown support

2 parents: 1e0c1d9 + c5236ef

5 files changed (+178, -15 lines)

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala

Lines changed: 33 additions & 3 deletions
@@ -17,7 +17,9 @@

 package org.apache.spark.sql.hive

+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.hive.ql.parse.ASTNode
+import org.apache.hadoop.mapreduce.Job

 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
@@ -30,7 +32,7 @@ import org.apache.spark.sql.catalyst.types.StringType
 import org.apache.spark.sql.execution.{DescribeCommand, OutputFaker, SparkPlan}
 import org.apache.spark.sql.hive
 import org.apache.spark.sql.hive.execution._
-import org.apache.spark.sql.hive.orc.{WriteToOrcFile, InsertIntoOrcTable, OrcRelation, OrcTableScan}
+import org.apache.spark.sql.hive.orc._
 import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.{SQLContext, SchemaRDD, Strategy}

@@ -243,8 +245,36 @@ private[hive] trait HiveStrategies {
       case logical.InsertIntoTable(table: OrcRelation, partition, child, overwrite) =>
         InsertIntoOrcTable(table, planLater(child), overwrite) :: Nil
       case PhysicalOperation(projectList, filters, relation: OrcRelation) =>
-        // TODO: need to implement predict push down.
-        val prunePushedDownFilters = identity[Seq[Expression]] _
+        val prunePushedDownFilters = {
+          OrcRelation.jobConf = sparkContext.hadoopConfiguration
+          if (ORC_FILTER_PUSHDOWN_ENABLED) {
+            val job = new Job(OrcRelation.jobConf)
+            val conf: Configuration = job.getConfiguration
+            logInfo("Orc push down filter enabled:" + filters)
+            (filters: Seq[Expression]) => {
+              val recordFilter = OrcFilters.createFilter(filters)
+              if (recordFilter.isDefined) {
+                logInfo("Parsed filters:" + recordFilter)
+                /**
+                 * To test this, set the following so that the reader
+                 * will not read the whole file even when it is small:
+                 * sparkContext.hadoopConfiguration.setInt(
+                 *   "mapreduce.input.fileinputformat.split.maxsize", 50)
+                 */
+                conf.set(SARG_PUSHDOWN, toKryo(recordFilter.get))
+                conf.setBoolean("hive.optimize.index.filter", true)
+                OrcRelation.jobConf = conf
+              }
+              // No matter whether ORC filters the rows or not, we still need the
+              // finer-grained filtering in the upper layer, so return all filters.
+              filters
+            }
+          } else {
+            identity[Seq[Expression]] _
+          }
+        }
         pruneFilterProject(
           projectList,
           filters,
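
A minimal usage sketch (not part of the patch) of how this pushdown path gets exercised, assuming a hypothetical table people_orc with an integer age column that resolves to an OrcRelation in this branch; the split-size trick comes from the inline comment above.

import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext

val sc = new SparkContext("local", "orc-pushdown-demo")
val hiveContext = new HiveContext(sc)

// Shrink input splits (as the comment above suggests) to make it easier to observe
// that the reader skips data when the SearchArgument is pushed down.
sc.hadoopConfiguration.setInt("mapreduce.input.fileinputformat.split.maxsize", 50)

// The WHERE clause arrives at the strategy as the Seq[Expression] that PhysicalOperation
// extracts; prunePushedDownFilters serializes it into the "sarg.pushdown" job property.
hiveContext.sql("SELECT name FROM people_orc WHERE age > 30").collect()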

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder
+import org.apache.spark.Logging
+
+private[sql] object OrcFilters extends Logging {
+
+  def createFilter(expr: Seq[Expression]): Option[SearchArgument] = {
+    if (expr == null || expr.size == 0) return None
+    var sarg: Option[Builder] = Some(SearchArgument.FACTORY.newBuilder())
+    sarg.get.startAnd()
+    expr.foreach {
+      x => {
+        sarg match {
+          case Some(s1) => sarg = createFilter(x, s1)
+          case _ => None
+        }
+      }
+    }
+    sarg match {
+      case Some(b) => Some(b.end.build)
+      case _ => None
+    }
+  }
+
+  def createFilter(expression: Expression, builder: Builder): Option[Builder] = {
+    expression match {
+      case p@And(left: Expression, right: Expression) => {
+        val b1 = builder.startAnd()
+        val b2 = createFilter(left, b1)
+        b2 match {
+          case Some(b) => val b3 = createFilter(right, b)
+            if (b3.isDefined) {
+              Some(b3.get.end)
+            } else {
+              None
+            }
+          case _ => None
+        }
+      }
+      case p@Or(left: Expression, right: Expression) => {
+        val b1 = builder.startOr()
+        val b2 = createFilter(left, b1)
+        b2 match {
+          case Some(b) => val b3 = createFilter(right, b)
+            if (b3.isDefined) {
+              Some(b3.get.end)
+            } else {
+              None
+            }
+          case _ => None
+        }
+      }
+      case p@EqualTo(left: Literal, right: NamedExpression) => {
+        val b1 = builder.equals(right.name, left.value)
+        Some(b1)
+      }
+      case p@EqualTo(left: NamedExpression, right: Literal) => {
+        val b1 = builder.equals(left.name, right.value)
+        Some(b1)
+      }
+      case p@LessThan(left: NamedExpression, right: Literal) => {
+        val b1 = builder.lessThan(left.name, right.value)
+        Some(b1)
+      }
+      case p@LessThan(left: Literal, right: NamedExpression) => {
+        val b1 = builder.startNot().lessThanEquals(right.name, left.value).end()
+        Some(b1)
+      }
+      case p@LessThanOrEqual(left: NamedExpression, right: Literal) => {
+        val b1 = builder.lessThanEquals(left.name, right.value)
+        Some(b1)
+      }
+      case p@LessThanOrEqual(left: Literal, right: NamedExpression) => {
+        val b1 = builder.startNot().lessThan(right.name, left.value).end()
+        Some(b1)
+      }
+      case p@GreaterThan(left: NamedExpression, right: Literal) => {
+        val b1 = builder.startNot().lessThanEquals(left.name, right.value).end()
+        Some(b1)
+      }
+      case p@GreaterThan(left: Literal, right: NamedExpression) => {
+        val b1 = builder.lessThanEquals(right.name, left.value)
+        Some(b1)
+      }
+      case p@GreaterThanOrEqual(left: NamedExpression, right: Literal) => {
+        val b1 = builder.startNot().lessThan(left.name, right.value).end()
+        Some(b1)
+      }
+      case p@GreaterThanOrEqual(left: Literal, right: NamedExpression) => {
+        val b1 = builder.lessThan(right.name, left.value)
+        Some(b1)
+      }
+      // TODO: test it
+      case p@EqualNullSafe(left: NamedExpression, right: NamedExpression) => {
+        val b1 = builder.nullSafeEquals(left.name, right.name)
+        Some(b1)
+      }
+      case p@In(left: NamedExpression, list: Seq[Literal]) => {
+        val b1 = builder.in(left.name, list.map(_.value).toArray)
+        Some(b1)
+      }
+      case _ => None
+    }
+  }
+}
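
For illustration (not in the commit), a small sketch of how createFilter maps Catalyst predicates onto a Hive SearchArgument; the age and name attributes are hypothetical, and since OrcFilters is private[sql] the snippet assumes it runs from within the org.apache.spark.sql packages.

import org.apache.hadoop.hive.ql.io.sarg.SearchArgument
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, GreaterThan, Literal}
import org.apache.spark.sql.catalyst.types.{IntegerType, StringType}
import org.apache.spark.sql.hive.orc.OrcFilters

val age = AttributeReference("age", IntegerType, nullable = true)()
val name = AttributeReference("name", StringType, nullable = true)()

// Catalyst form of `WHERE age > 30 AND name = 'alice'`, already split into conjuncts.
val predicates = Seq(GreaterThan(age, Literal(30)), EqualTo(name, Literal("alice")))

// createFilter wraps the conjuncts in a top-level startAnd()/end and builds the argument;
// the result is what HiveStrategies serializes with toKryo into the "sarg.pushdown" property.
val sarg: Option[SearchArgument] = OrcFilters.createFilter(predicates)
sarg.foreach(s => println(s))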

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ private[sql] object OrcRelation {
     }
     path
   }
+  var jobConf: Configuration = _
 }

 private[sql] object OrcFileOperator{
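
The new field is a shared, mutable handle: the planner in HiveStrategies writes a Configuration (possibly carrying the serialized SearchArgument) into OrcRelation.jobConf, and OrcTableScan.execute() later builds its Job from the same handle. A rough sketch of that hand-off, with a placeholder value and assuming code living in the same private[sql] scope:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

val conf = new Configuration()
conf.set("sarg.pushdown", "<kryo+base64-encoded SearchArgument>") // placeholder, see toKryo below
OrcRelation.jobConf = conf                       // planner side (HiveStrategies)
val job = new Job(OrcRelation.jobConf)           // scan side (OrcTableScan.execute)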

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala

Lines changed: 8 additions & 11 deletions
@@ -77,8 +77,7 @@ case class OrcTableScan(

   override def execute(): RDD[Row] = {
     val sc = sqlContext.sparkContext
-    val job = new Job(sc.hadoopConfiguration)
-
+    val job = new Job(OrcRelation.jobConf) // sc.hadoopConfiguration)
     val conf: Configuration = job.getConfiguration
     relation.path.split(",").foreach { curPath =>
       val qualifiedPath = {
@@ -110,17 +109,15 @@ case class OrcTableScan(
    * @param conf
    */
   def addColumnIds(output: Seq[Attribute], relation: OrcRelation, conf: Configuration) {
-    val fieldIdMap = relation.output.map(_.name).zipWithIndex.toMap
-    val names = output.map(_.name)
-    val ids = output.map { att =>
-      val realName = att.name.toLowerCase(Locale.ENGLISH)
-      fieldIdMap.getOrElse(realName, -1)
-    }.filter(_ >= 0).map(_.asInstanceOf[Integer])

+    val ids =
+      output.map(a =>
+        relation.output.indexWhere(_.name == a.name): Integer)
+        .filter(_ >= 0)
+    val names = output.map(_.name)
     assert(ids.size == names.size, "columns id and name length does not match!")
-    if (ids != null && !ids.isEmpty) {
-      HiveShim.appendReadColumns(conf, ids, names)
-    }
+    val sorted = ids.zip(names).sorted
+    HiveShim.appendReadColumns(conf, sorted.map(_._1), sorted.map(_._2))
   }
 }
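
To make the rewritten addColumnIds logic concrete, here is an illustration using plain strings instead of Catalyst Attributes (the schema and requested columns are hypothetical; the real code boxes the ids as java.lang.Integer before handing them to HiveShim.appendReadColumns):

val relationColumns = Seq("id", "name", "age")   // relation.output.map(_.name)
val requested = Seq("age", "id")                 // output.map(_.name)

// indexWhere maps each requested column to its position in the relation schema
val ids = requested.map(n => relationColumns.indexWhere(_ == n)).filter(_ >= 0)  // List(2, 0)

// pair ids with names and sort by id, mirroring what the patched addColumnIds
// passes to HiveShim.appendReadColumns
val sorted = ids.zip(requested).sorted           // List((0, "id"), (2, "age"))
println(sorted)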

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala

Lines changed: 11 additions & 1 deletion
@@ -17,6 +17,9 @@

 package org.apache.spark.sql.hive

+import com.esotericsoftware.kryo.Kryo
+import com.esotericsoftware.kryo.io.Output
+import org.apache.commons.codec.binary.Base64
 import org.apache.spark.sql.{SQLContext, SchemaRDD}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.hadoop.hive.serde2.objectinspector._
@@ -46,6 +49,13 @@ package object orc {
   // for orc compression type, only take effect in hive 0.13.1
   val orcDefaultCompressVar = "hive.exec.orc.default.compress"
   // for prediction push down in hive-0.13.1, don't enable it
-  val ORC_FILTER_PUSHDOWN_ENABLED = false
+  val ORC_FILTER_PUSHDOWN_ENABLED = true
   val SARG_PUSHDOWN = "sarg.pushdown"
+
+  def toKryo(input: Any) = {
+    val out = new Output(4 * 1024, 10 * 1024 * 1024);
+    new Kryo().writeObject(out, input);
+    out.close();
+    Base64.encodeBase64String(out.toBytes());
+  }
 }
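
toKryo Kryo-serializes the SearchArgument and Base64-encodes it so it can travel as a plain string in the "sarg.pushdown" job property. A hypothetical decoding counterpart (not in the patch) shows the round trip:

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.Input
import org.apache.commons.codec.binary.Base64

def fromKryo[T](encoded: String, clazz: Class[T]): T = {
  val bytes = Base64.decodeBase64(encoded)         // undoes Base64.encodeBase64String
  new Kryo().readObject(new Input(bytes), clazz)   // undoes Kryo.writeObject
}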
